[BOLT][AArch64] Support FEAT_CMPBR branch instructions. (#174972)

The Armv9.6-A compare-and-branch instructions encode their branch target
as a short-range 9-bit immediate value. They do not have a corresponding
relocation type in the ABI. For now we only support them in the compact
code model, with diagnostics added in the LongJmp pass to enforce this
condition. Some interesting edge cases we cover:
- function splitting works whether the target is within or beyond the 1KB
  range of those instructions,
  - but does not work beyond the 128MB limit of the compact code model
- branch inversion works with block reordering as long as the adjusted
  immediate value remains in bounds
This commit is contained in:
Alexandros Lamprineas 2026-02-12 15:49:00 +00:00 committed by GitHub
parent ab7a6e6b4e
commit 0584699c11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 463 additions and 12 deletions

View File

@ -1747,6 +1747,12 @@ public:
return false;
}
/// Target hook: return true if \p Inst is a short range branch.
/// AArch64 uses this to perform diagnostics in the LongJmp pass.
///
/// The default implementation is not meant to be reached: it reports
/// "not implemented" through llvm_unreachable, so a target that is queried
/// through this hook has to provide its own override.
virtual bool isShortRangeBranch(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
/// Receives a list of MCInst of the basic block to analyze and interpret the
/// terminators of this basic block. TBB must be initialized with the original
/// fall-through for this BB.

View File

@ -73,6 +73,13 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
}
/// Return true if \p Inst is a branch or a call that is neither an indirect
/// branch nor an indirect call.
///
/// On AArch64 this also acts as a diagnostic point: if \p Inst is a short
/// range branch and the compact code model option is not enabled, an error
/// is printed together with the instruction and the process exits with
/// status 1.
static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
  const bool IsUnsupportedShortBranch = BC.isAArch64() &&
                                        BC.MIB->isShortRangeBranch(Inst) &&
                                        !opts::CompactCodeModel;
  if (IsUnsupportedShortBranch) {
    BC.errs() << "BOLT-ERROR: short range branch not supported"
              << " outside compact code model\n";
    BC.printInstruction(BC.errs(), Inst);
    exit(1);
  }

  // Only direct control transfers qualify.
  const bool IsBranchOrCall = BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst);
  if (!IsBranchOrCall)
    return false;
  if (BC.MIB->isIndirectBranch(Inst))
    return false;
  return !BC.MIB->isIndirectCall(Inst);
}

View File

@ -716,6 +716,55 @@ public:
return Insts;
}
/// Return true if \p Inst is one of the FEAT_CMPBR compare-and-branch
/// instructions. Four families are recognized: register against immediate
/// (ri), register against register (rr), and the byte (CBB) and halfword
/// (CBH) register-register comparisons.
bool isCompAndBranch(const MCInst &Inst) const {
  // clang-format off
  switch (Inst.getOpcode()) {
  default:
    return false;

  // Register compared with an immediate (W and X forms).
  case AArch64::CBGTWri: case AArch64::CBGTXri:
  case AArch64::CBLTWri: case AArch64::CBLTXri:
  case AArch64::CBHIWri: case AArch64::CBHIXri:
  case AArch64::CBLOWri: case AArch64::CBLOXri:
  case AArch64::CBEQWri: case AArch64::CBEQXri:
  case AArch64::CBNEWri: case AArch64::CBNEXri:

  // Register compared with a register (W and X forms).
  case AArch64::CBGTWrr: case AArch64::CBGTXrr:
  case AArch64::CBGEWrr: case AArch64::CBGEXrr:
  case AArch64::CBHIWrr: case AArch64::CBHIXrr:
  case AArch64::CBHSWrr: case AArch64::CBHSXrr:
  case AArch64::CBEQWrr: case AArch64::CBEQXrr:
  case AArch64::CBNEWrr: case AArch64::CBNEXrr:

  // Byte comparisons.
  case AArch64::CBBGTWrr: case AArch64::CBBGEWrr:
  case AArch64::CBBHIWrr: case AArch64::CBBHSWrr:
  case AArch64::CBBEQWrr: case AArch64::CBBNEWrr:

  // Halfword comparisons.
  case AArch64::CBHGTWrr: case AArch64::CBHGEWrr:
  case AArch64::CBHHIWrr: case AArch64::CBHHSWrr:
  case AArch64::CBHEQWrr: case AArch64::CBHNEWrr:
    return true;
  }
  // clang-format on
}
bool isTB(const MCInst &Inst) const {
return (Inst.getOpcode() == AArch64::TBNZW ||
Inst.getOpcode() == AArch64::TBNZX ||
@ -1260,7 +1309,7 @@ public:
if (isConditionalBranch(Inst) || isADR(Inst) || isADRP(Inst) ||
isMOVW(Inst))
OpNum = 1;
if (isTB(Inst) || isAddXri(Inst))
if (isTB(Inst) || isAddXri(Inst) || isCompAndBranch(Inst))
OpNum = 2;
}
@ -1329,7 +1378,7 @@ public:
++OI;
}
if (isTB(Inst)) {
if (isTB(Inst) || isCompAndBranch(Inst)) {
assert(MCPlus::getNumPrimeOperands(Inst) >= 3 &&
"Invalid number of operands");
OI = Inst.begin() + 2;
@ -1798,6 +1847,7 @@ public:
}
unsigned getInvertedBranchOpcode(unsigned Opcode) const {
// clang-format off
switch (Opcode) {
default:
llvm_unreachable("Failed to invert branch opcode");
@ -1810,7 +1860,48 @@ public:
case AArch64::CBZX: return AArch64::CBNZX;
case AArch64::CBNZW: return AArch64::CBZW;
case AArch64::CBNZX: return AArch64::CBZX;
// Compare register with immediate and branch.
case AArch64::CBGTWri: return AArch64::CBLTWri; // +1
case AArch64::CBGTXri: return AArch64::CBLTXri; // +1
case AArch64::CBLTWri: return AArch64::CBGTWri; // -1
case AArch64::CBLTXri: return AArch64::CBGTXri; // -1
case AArch64::CBHIWri: return AArch64::CBLOWri; // +1
case AArch64::CBHIXri: return AArch64::CBLOXri; // +1
case AArch64::CBLOWri: return AArch64::CBHIWri; // -1
case AArch64::CBLOXri: return AArch64::CBHIXri; // -1
case AArch64::CBEQWri: return AArch64::CBNEWri;
case AArch64::CBEQXri: return AArch64::CBNEXri;
case AArch64::CBNEWri: return AArch64::CBEQWri;
case AArch64::CBNEXri: return AArch64::CBEQXri;
// Compare registers and branch.
case AArch64::CBGTWrr: return AArch64::CBGEWrr; // swap
case AArch64::CBGTXrr: return AArch64::CBGEXrr; // swap
case AArch64::CBGEWrr: return AArch64::CBGTWrr; // swap
case AArch64::CBGEXrr: return AArch64::CBGTXrr; // swap
case AArch64::CBHIWrr: return AArch64::CBHSWrr; // swap
case AArch64::CBHIXrr: return AArch64::CBHSXrr; // swap
case AArch64::CBHSWrr: return AArch64::CBHIWrr; // swap
case AArch64::CBHSXrr: return AArch64::CBHIXrr; // swap
case AArch64::CBEQWrr: return AArch64::CBNEWrr;
case AArch64::CBEQXrr: return AArch64::CBNEXrr;
case AArch64::CBNEWrr: return AArch64::CBEQWrr;
case AArch64::CBNEXrr: return AArch64::CBEQXrr;
// Compare bytes and branch.
case AArch64::CBBGTWrr: return AArch64::CBBGEWrr; // swap
case AArch64::CBBGEWrr: return AArch64::CBBGTWrr; // swap
case AArch64::CBBHIWrr: return AArch64::CBBHSWrr; // swap
case AArch64::CBBHSWrr: return AArch64::CBBHIWrr; // swap
case AArch64::CBBEQWrr: return AArch64::CBBNEWrr;
case AArch64::CBBNEWrr: return AArch64::CBBEQWrr;
// Compare halfwords and branch.
case AArch64::CBHGTWrr: return AArch64::CBHGEWrr; // swap
case AArch64::CBHGEWrr: return AArch64::CBHGTWrr; // swap
case AArch64::CBHHIWrr: return AArch64::CBHHSWrr; // swap
case AArch64::CBHHSWrr: return AArch64::CBHHIWrr; // swap
case AArch64::CBHEQWrr: return AArch64::CBHNEWrr;
case AArch64::CBHNEWrr: return AArch64::CBHEQWrr;
}
// clang-format on
}
unsigned getCondCode(const MCInst &Inst) const override {
@ -1830,11 +1921,89 @@ public:
}
}
/// Return true if \p Opcode is a register-register compare-and-branch whose
/// condition is GT, GE, HI or HS (word, doubleword, byte or halfword form).
///
/// reverseBranchCondition() passes the already inverted opcode here and
/// swaps the two register operands when the answer is true, e.g. the
/// inverse of "cbge x0, x1" is emitted as "cbgt x1, x0". The EQ and NE
/// forms are symmetric and need no swap.
bool needsRegSwap(unsigned Opcode) const {
  // clang-format off
  switch (Opcode) {
  // Signed conditions.
  case AArch64::CBGTWrr:  case AArch64::CBGTXrr:
  case AArch64::CBGEWrr:  case AArch64::CBGEXrr:
  case AArch64::CBBGTWrr: case AArch64::CBBGEWrr:
  case AArch64::CBHGTWrr: case AArch64::CBHGEWrr:
  // Unsigned conditions.
  case AArch64::CBHIWrr:  case AArch64::CBHIXrr:
  case AArch64::CBHSWrr:  case AArch64::CBHSXrr:
  case AArch64::CBBHIWrr: case AArch64::CBBHSWrr:
  case AArch64::CBHHIWrr: case AArch64::CBHHSWrr:
    return true;
  default:
    return false;
  }
  // clang-format on
}
/// Return true if \p Opcode is the immediate form of CBGT or CBHI.
///
/// Callers pass the opcode obtained after inversion: a branch that was
/// CBLT/CBLO before inversion becomes CBGT/CBHI with the immediate lowered
/// by one, e.g. "cblo x0, #1" turns into "cbhi x0, #0".
bool needsImmDec(unsigned Opcode) const {
  const bool IsSignedGreater =
      Opcode == AArch64::CBGTWri || Opcode == AArch64::CBGTXri;
  const bool IsUnsignedHigher =
      Opcode == AArch64::CBHIWri || Opcode == AArch64::CBHIXri;
  return IsSignedGreater || IsUnsignedHigher;
}
/// Return true if \p Opcode is the immediate form of CBLT or CBLO.
///
/// Callers pass the opcode obtained after inversion: a branch that was
/// CBGT/CBHI before inversion becomes CBLT/CBLO with the immediate raised
/// by one, e.g. "cbgt x0, #0" turns into "cblt x0, #1".
bool needsImmInc(unsigned Opcode) const {
  const bool IsSignedLess =
      Opcode == AArch64::CBLTWri || Opcode == AArch64::CBLTXri;
  const bool IsUnsignedLower =
      Opcode == AArch64::CBLOWri || Opcode == AArch64::CBLOXri;
  return IsSignedLess || IsUnsignedLower;
}
/// Return false for a compare-and-branch whose inversion would push its
/// immediate out of bounds; otherwise defer to the generic check.
///
/// Inverting an immediate form adjusts operand 1 by one. An immediate of 0
/// cannot be decremented and an immediate of 63 cannot be incremented, so
/// those two cases are reported as not reversible.
bool isReversibleBranch(const MCInst &Inst) const override {
  if (isCompAndBranch(Inst)) {
    const unsigned Inverted = getInvertedBranchOpcode(Inst.getOpcode());
    // Operand 1 is only read as an immediate for the opcodes that carry one.
    const bool StuckAtLowerBound =
        needsImmDec(Inverted) && Inst.getOperand(1).getImm() == 0;
    const bool StuckAtUpperBound =
        needsImmInc(Inverted) && Inst.getOperand(1).getImm() == 63;
    if (StuckAtLowerBound || StuckAtUpperBound)
      return false;
  }
  return MCPlusBuilder::isReversibleBranch(Inst);
}
void reverseBranchCondition(MCInst &Inst, const MCSymbol *TBB,
MCContext *Ctx) const override {
if (isTB(Inst) || isCB(Inst)) {
Inst.setOpcode(getInvertedBranchOpcode(Inst.getOpcode()));
if (!isReversibleBranch(Inst)) {
errs() << "BOLT-ERROR: Cannot reverse branch " << Inst << "\n";
exit(1);
}
if (isTB(Inst) || isCB(Inst) || isCompAndBranch(Inst)) {
unsigned InvertedOpcode = getInvertedBranchOpcode(Inst.getOpcode());
Inst.setOpcode(InvertedOpcode);
assert(Inst.getOpcode() != 0 && "Invalid branch instruction");
// The FEAT_CMPBR compare-and-branch instructions cannot encode all
// the possible condition codes, therefore we either have to adjust
// the immediate value by +-1, or to swap the register operands
// when reversing the branch condition.
if (needsRegSwap(InvertedOpcode))
std::swap(Inst.getOperand(0), Inst.getOperand(1));
else if (needsImmDec(InvertedOpcode))
Inst.getOperand(1).setImm(Inst.getOperand(1).getImm() - 1);
else if (needsImmInc(InvertedOpcode))
Inst.getOperand(1).setImm(Inst.getOperand(1).getImm() + 1);
} else if (Inst.getOpcode() == AArch64::Bcc) {
Inst.getOperand(0).setImm(AArch64CC::getInvertedCondCode(
static_cast<AArch64CC::CondCode>(Inst.getOperand(0).getImm())));
@ -1849,18 +2018,16 @@ public:
}
int getPCRelEncodingSize(const MCInst &Inst) const override {
if (isCompAndBranch(Inst))
return 11;
if (isTB(Inst))
return 16;
if (isCB(Inst))
return 21;
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Failed to get pcrel encoding size");
return 0;
case AArch64::TBZW: return 16;
case AArch64::TBZX: return 16;
case AArch64::TBNZW: return 16;
case AArch64::TBNZX: return 16;
case AArch64::CBZW: return 21;
case AArch64::CBZX: return 21;
case AArch64::CBNZW: return 21;
case AArch64::CBNZX: return 21;
case AArch64::B: return 28;
case AArch64::BL: return 28;
case AArch64::Bcc: return 21;
@ -2242,6 +2409,10 @@ public:
convertJmpToTailCall(Inst);
}
/// On AArch64 the short range branches are exactly the FEAT_CMPBR
/// compare-and-branch instructions recognized by isCompAndBranch().
bool isShortRangeBranch(const MCInst &Inst) const override {
return isCompAndBranch(Inst);
}
bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
const MCSymbol *&TBB, const MCSymbol *&FBB,
MCInst *&CondBranch,

View File

@ -0,0 +1,108 @@
# This test checks that branch inversion works when reordering blocks which
# contain short range conditional branches. It also covers edge cases, such as
# the immediate value already being at the upper or lower allowed bound, in
# which case the transformation bails out.
# REQUIRES: system-linux, asserts
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: llvm-bolt -v=1 %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp --compact-code-model \
# RUN: | FileCheck %s --check-prefix=BOLT-INFO
# RUN: llvm-objdump -d %t.bolt | FileCheck %s
# CHECK: Disassembly of section .text:
## The profile gives the branch target .exit0 a count of 10 and the
## fall-through .cold0 a count of 1. The expected output inverts the branch
## so that the hot block follows it: cbgt x0, #0 becomes cblt x0, #1
## (immediate raised by one).
.globl immediate_increment
.type immediate_increment, %function
immediate_increment:
.entry0:
# FDATA: 1 immediate_increment #.entry0# 10
cbgt x0, #0, .exit0
.cold0:
# FDATA: 1 immediate_increment #.cold0# 1
mov x0, #1
ret
.exit0:
# FDATA: 1 immediate_increment #.exit0# 10
mov x0, #2
ret
# CHECK: <immediate_increment>:
# CHECK-NEXT: {{.*}} cblt x0, #0x1, 0x[[ADDR0:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} mov x0, #0x2 // =2
# CHECK-NEXT: {{.*}} ret
# CHECK-NEXT: [[ADDR0]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: {{.*}} ret
## Same hot and cold layout as above, with an unsigned lower-than compare.
## The expected output inverts cblo x0, #1 into cbhi x0, #0 (immediate
## lowered by one).
.globl immediate_decrement
.type immediate_decrement, %function
immediate_decrement:
.entry1:
# FDATA: 1 immediate_decrement #.entry1# 10
cblo x0, #1, .exit1
.cold1:
# FDATA: 1 immediate_decrement #.cold1# 1
mov x0, #1
ret
.exit1:
# FDATA: 1 immediate_decrement #.exit1# 10
mov x0, #2
ret
# CHECK: <immediate_decrement>:
# CHECK-NEXT: {{.*}} cbhi x0, #0x0, 0x[[ADDR1:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} mov x0, #0x2 // =2
# CHECK-NEXT: {{.*}} ret
# CHECK-NEXT: [[ADDR1]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: {{.*}} ret
## Register-register form. The expected output inverts cbge x0, x1 into
## cbgt x1, x0, that is, the condition changes and the two register operands
## trade places.
.globl register_swap
.type register_swap, %function
register_swap:
.entry2:
# FDATA: 1 register_swap #.entry2# 10
cbge x0, x1, .exit2
.cold2:
# FDATA: 1 register_swap #.cold2# 1
mov x0, #1
ret
.exit2:
# FDATA: 1 register_swap #.exit2# 10
mov x0, #2
ret
# CHECK: <register_swap>:
# CHECK-NEXT: {{.*}} cbgt x1, x0, 0x[[ADDR2:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} mov x0, #0x2 // =2
# CHECK-NEXT: {{.*}} ret
# CHECK-NEXT: [[ADDR2]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: {{.*}} ret
## Upper bound case. Inverting cbgt x0, #63 would need an immediate of 64,
## which is out of bounds, so the branch is expected to stay as written and
## an unconditional branch to the cold block is expected right after it.
.globl irreversible
.type irreversible, %function
irreversible:
.entry3:
# FDATA: 1 irreversible #.entry3# 10
cbgt x0, #63, .exit3
.cold3:
# FDATA: 1 irreversible #.cold3# 1
mov x0, #1
ret
.exit3:
# FDATA: 1 irreversible #.exit3# 10
mov x0, #2
ret
# BOLT-INFO: unable to swap successors in irreversible
# CHECK: <irreversible>:
# CHECK-NEXT: {{.*}} cbgt x0, #0x3f, 0x[[ADDR3:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} b 0x[[ADDR4:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: [[ADDR3]]: {{.*}} mov x0, #0x2 // =2
# CHECK-NEXT: {{.*}} ret
# CHECK-NEXT: [[ADDR4]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: {{.*}} ret
## Force relocation mode.
.reloc 0, R_AARCH64_NONE

View File

@ -0,0 +1,49 @@
# This test checks that reordering blocks which contain short range
# conditional branches works while the branch target stays in range, and
# fails with a fixup error once the target goes out of range.
# REQUIRES: system-linux, asserts
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=0
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp --compact-code-model
# RUN: llvm-objdump -d %t.bolt | FileCheck %s
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=256
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata --reorder-blocks=ext-tsp --compact-code-model --keep-nops 2>&1 \
# RUN: | FileCheck %s --check-prefix=FIXUP_OUT_OF_RANGE
## The compare-and-branch targets the cold exit (count 1) while the hot exit
## (count 10) is reached through the fall-through block. The expected layout
## places the hot exit directly after the branch and the cold exit last, so
## the nop padding inside the hot exit sits between the branch and its
## target. Without padding the layout is accepted; with 256 nops (1KB) the
## second invocation expects an out of range fixup error.
.globl reorder_blocks
.type reorder_blocks, %function
reorder_blocks:
.entry:
# FDATA: 1 reorder_blocks #.entry# 10
cbgt x0, #0, .cold_exit
.fall_through:
# FDATA: 1 reorder_blocks #.fall_through# 10
b .hot_exit
.cold_exit:
# FDATA: 1 reorder_blocks #.cold_exit# 1
mov x0, #1
ret
.hot_exit:
# FDATA: 1 reorder_blocks #.hot_exit# 10
.rept NUM_NOPS
nop
.endr
mov x0, #2
ret
## Force relocation mode.
.reloc 0, R_AARCH64_NONE
# CHECK: Disassembly of section .text:
# CHECK: <reorder_blocks>:
# CHECK-NEXT: {{.*}} cbgt x0, #0x0, 0x[[ADDR:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} mov x0, #0x2 // =2
# CHECK-NEXT: {{.*}} ret
# CHECK-NEXT: [[ADDR]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: {{.*}} ret
# FIXUP_OUT_OF_RANGE: error: fixup value out of range

View File

@ -0,0 +1,62 @@
# This test checks that splitting functions which contain short range
# conditional branches works in compact code model without relying on
# relocations.
# REQUIRES: system-linux, asserts
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=0 -DRESERVE_SPACE=0
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --compact-code-model
# RUN: llvm-objdump -d %t.bolt | FileCheck %s
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=256 -DRESERVE_SPACE=0
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --compact-code-model --keep-nops
# RUN: llvm-objdump -d %t.bolt | FileCheck %s
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=0 -DRESERVE_SPACE=1
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --compact-code-model 2>&1 \
# RUN: | FileCheck %s --check-prefix=BEYOND-128MB
## Only the entry block carries profile data. The expected output keeps the
## compare-and-branch in the hot fragment, followed by two unconditional
## branches, and places the mov and the ret in the cold fragment foo.cold.0.
.globl foo
.type foo, %function
foo:
.entry_foo:
# FDATA: 1 foo #.entry_foo# 10
cbgt x0, #0, .Lcold_foo
mov x0, #1
.Lcold_foo:
ret
## Filler function. The nop count and the optional 0x8000000 bytes (128MB)
## of reserved space are selected on the command line of each invocation.
## The invocation that reserves the space expects a Branch26PCRel out of
## range failure.
.globl large_function
.type large_function, %function
large_function:
.entry_large_function:
# FDATA: 1 large_function #.entry_large_function# 10
.rept NUM_NOPS
nop
.endr
ret
.if RESERVE_SPACE
.space 0x8000000
.endif
## Force relocation mode.
.reloc 0, R_AARCH64_NONE
# CHECK: Disassembly of section .text:
# CHECK: <foo>:
# CHECK-NEXT: {{.*}} cbgt x0, #0x0, 0x[[ADDR0:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: {{.*}} b 0x[[ADDR1:[0-9a-f]+]] <{{.*}}>
# CHECK-NEXT: [[ADDR0]]: {{.*}} b 0x[[ADDR2:[0-9a-f]+]] <{{.*}}>
# CHECK: Disassembly of section .text.cold:
# CHECK: <foo.cold.0>:
# CHECK-NEXT: [[ADDR1]]: {{.*}} mov x0, #0x1 // =1
# CHECK-NEXT: [[ADDR2]]: {{.*}} ret
# BEYOND-128MB: BOLT-ERROR: JITLink failed: In graph in-memory object file, section .text: relocation target {{0x[0-9a-f]+}} (<anonymous symbol>) is out of range of Branch26PCRel fixup at address {{0x[0-9a-f]+}}

View File

@ -0,0 +1,48 @@
# This test checks that splitting functions which contain short range
# conditional branches does not work outside compact code model.
# REQUIRES: system-linux, asserts
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=0 -DRESERVE_SPACE=0
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions 2>&1 \
# RUN: | FileCheck %s
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=256 -DRESERVE_SPACE=0
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --keep-nops 2>&1 \
# RUN: | FileCheck %s
# RUN: %clang %cflags -march=armv9-a+cmpbr -Wl,-q %s -o %t -DNUM_NOPS=0 -DRESERVE_SPACE=1
# RUN: link_fdata --no-lbr %s %t %t.fdata
# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions 2>&1 \
# RUN: | FileCheck %s
## Function with a compare-and-branch and profile data on its entry block
## only. Every invocation of this test runs without the compact code model
## option and expects the short range branch error.
.globl foo
.type foo, %function
foo:
.entry_foo:
# FDATA: 1 foo #.entry_foo# 10
cbgt x0, #0, .Lcold_foo
mov x0, #1
.Lcold_foo:
ret
## Filler function. The nop count and the optional 0x8000000 bytes (128MB)
## of reserved space are selected on the command line of each invocation.
## All three configurations expect the same error.
.globl large_function
.type large_function, %function
large_function:
.entry_large_function:
# FDATA: 1 large_function #.entry_large_function# 10
.rept NUM_NOPS
nop
.endr
ret
.if RESERVE_SPACE
.space 0x8000000
.endif
## Force relocation mode.
.reloc 0, R_AARCH64_NONE
# CHECK: BOLT-ERROR: short range branch not supported outside compact code model