[BOLT][AArch64] Inlining of Memcpy (#154929)
The BOLT pass for inlining memcpy was previously X86-specific and relied on the `rep movsb` instruction. This patch adds a static size analysis for AArch64 memcpy inlining: it extracts the copy size from the instructions preceding the call and uses it to generate optimal width-specific load/store sequences.
This commit is contained in:
parent
872d2c90be
commit
244588b9d7
@ -637,7 +637,7 @@
|
||||
|
||||
- `--inline-memcpy`
|
||||
|
||||
Inline memcpy using 'rep movsb' instruction (X86-only)
|
||||
Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
|
||||
|
||||
- `--inline-small-functions`
|
||||
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#ifndef BOLT_CORE_MCPLUSBUILDER_H
|
||||
#define BOLT_CORE_MCPLUSBUILDER_H
|
||||
|
||||
#include "bolt/Core/BinaryBasicBlock.h"
|
||||
#include "bolt/Core/MCPlus.h"
|
||||
#include "bolt/Core/Relocation.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
@ -1902,6 +1903,15 @@ public:
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Find memcpy size in bytes by using preceding instructions.
|
||||
/// Returns std::nullopt if size cannot be determined (no-op for most
|
||||
/// targets).
|
||||
virtual std::optional<uint64_t>
|
||||
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
|
||||
BinaryBasicBlock::iterator CallInst) const {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
|
||||
/// (dest + n) instead of dest.
|
||||
virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
|
||||
@ -1909,6 +1919,22 @@ public:
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Creates size-aware inline memcpy instruction. If \p KnownSize is provided,
|
||||
/// generates optimized code for that specific size. Falls back to regular
|
||||
/// createInlineMemcpy if size is unknown or not needed (e.g. with X86).
|
||||
virtual InstructionListType
|
||||
createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
|
||||
return createInlineMemcpy(ReturnEnd);
|
||||
}
|
||||
|
||||
/// Extract immediate value from move instruction that sets the given
|
||||
/// register. Returns the immediate value if the instruction is a
|
||||
/// move-immediate to TargetReg.
|
||||
virtual std::optional<uint64_t>
|
||||
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/// Create a target-specific relocation out of the \p Fixup.
|
||||
/// Note that not every fixup could be converted into a relocation.
|
||||
virtual std::optional<Relocation>
|
||||
|
||||
@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
|
||||
}
|
||||
|
||||
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
|
||||
if (!BC.isX86())
|
||||
if (!BC.isX86() && !BC.isAArch64())
|
||||
return Error::success();
|
||||
|
||||
uint64_t NumInlined = 0;
|
||||
@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
|
||||
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
|
||||
const bool IsTailCall = BC.MIB->isTailCall(Inst);
|
||||
|
||||
// Extract size from preceding instructions (AArch64 only).
|
||||
// Pattern: MOV X2, #nb-bytes; BL memcpy (X0 = dest, X1 = src, X2 = size).
|
||||
std::optional<uint64_t> KnownSize =
|
||||
BC.MIB->findMemcpySizeInBytes(BB, II);
|
||||
|
||||
if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
|
||||
continue;
|
||||
|
||||
const InstructionListType NewCode =
|
||||
BC.MIB->createInlineMemcpy(IsMemcpy8);
|
||||
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
|
||||
II = BB.replaceInstruction(II, NewCode);
|
||||
std::advance(II, NewCode.size() - 1);
|
||||
if (IsTailCall) {
|
||||
|
||||
@ -248,7 +248,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
|
||||
|
||||
static cl::opt<bool> StringOps(
|
||||
"inline-memcpy",
|
||||
cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
|
||||
cl::desc(
|
||||
"inline memcpy using size-specific optimized instructions "
|
||||
"(X86: 'rep movsb', AArch64: width-optimized register operations)"),
|
||||
cl::cat(BoltOptCategory));
|
||||
|
||||
static cl::opt<bool> StripRepRet(
|
||||
|
||||
@ -2620,6 +2620,122 @@ public:
|
||||
getInstructionSize(const MCInst &Inst) const override {
|
||||
return 4;
|
||||
}
|
||||
|
||||
std::optional<uint64_t>
|
||||
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
|
||||
// Match MOVZ instructions (both X and W register variants) with no shift.
|
||||
if ((Inst.getOpcode() == AArch64::MOVZXi ||
|
||||
Inst.getOpcode() == AArch64::MOVZWi) &&
|
||||
Inst.getOperand(2).getImm() == 0 &&
|
||||
getAliases(TargetReg)[Inst.getOperand(0).getReg()])
|
||||
return Inst.getOperand(1).getImm();
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<uint64_t>
|
||||
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
|
||||
BinaryBasicBlock::iterator CallInst) const override {
|
||||
MCPhysReg SizeReg = getIntArgRegister(2);
|
||||
if (SizeReg == getNoRegister())
|
||||
return std::nullopt;
|
||||
|
||||
BitVector WrittenRegs(RegInfo->getNumRegs());
|
||||
const BitVector &SizeRegAliases = getAliases(SizeReg);
|
||||
|
||||
for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
|
||||
const MCInst &Inst = *InstIt;
|
||||
WrittenRegs.reset();
|
||||
getWrittenRegs(Inst, WrittenRegs);
|
||||
|
||||
if (WrittenRegs.anyCommon(SizeRegAliases))
|
||||
return extractMoveImmediate(Inst, SizeReg);
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/// Build the inline replacement for a memcpy call that copies \p KnownSize
/// bytes. The copy itself comes from generateSizeSpecificMemcpy; when
/// \p ReturnEnd is set (the _memcpy8 flavour) a trailing ADD advances X0 by
/// the size so that dest + size is returned instead of dest.
///
/// \p KnownSize must hold a value: this is asserted, and the optional is
/// dereferenced unconditionally.
/// NOTE(review): for a size of 0 with \p ReturnEnd false the returned list is
/// empty; confirm that callers cope with an empty replacement.
InstructionListType
createInlineMemcpy(bool ReturnEnd,
                   std::optional<uint64_t> KnownSize) const override {
  assert(KnownSize.has_value() &&
         "AArch64 memcpy inlining requires known size");
  const uint64_t NumBytes = *KnownSize;

  InstructionListType Code;
  generateSizeSpecificMemcpy(Code, NumBytes);
  if (!ReturnEnd)
    return Code;

  // _memcpy8 returns the end of the destination: X0 = X0 + NumBytes.
  Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                        .addReg(AArch64::X0)
                        .addReg(AArch64::X0)
                        .addImm(NumBytes)
                        .addImm(0));
  return Code;
}
|
||||
|
||||
/// Append to \p Code an unrolled load/store sequence that copies exactly
/// \p Size bytes from the address in X1 to the address in X0, and return a
/// copy of the resulting list.
///
/// Chunks are taken greedily from widest to narrowest (16, 8, 4, 2, 1 bytes)
/// at increasing offsets. Q16 is the temporary for 16-byte chunks and X9/W9
/// for narrower ones. A 32-byte copy is the single special case: its second
/// 16-byte chunk goes through Q17 instead of reusing Q16. \p Size is asserted
/// to be at most 64; a \p Size of 0 appends nothing.
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                               uint64_t Size) const {
  // Emits a load of Reg from [X1 + Slot] followed by a store of Reg to
  // [X0 + Slot]. Slot is the immediate handed to the instruction builder;
  // callers pass the byte offset divided by the access width.
  auto EmitCopy = [&Code](unsigned LoadOpc, unsigned StoreOpc, unsigned Reg,
                          unsigned Slot) {
    Code.emplace_back(
        MCInstBuilder(LoadOpc).addReg(Reg).addReg(AArch64::X1).addImm(Slot));
    Code.emplace_back(
        MCInstBuilder(StoreOpc).addReg(Reg).addReg(AArch64::X0).addImm(Slot));
  };

  // 32 bytes: two 16-byte chunks through distinct temporaries.
  if (Size == 32) {
    EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
    EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
    return Code;
  }

  // Caller should have already filtered out sizes > 64 bytes.
  assert(Size <= 64 &&
         "Size should be <= 64 bytes for AArch64 memcpy inlining");

  // Access widths in descending order with their opcodes and temporary.
  struct CopyStep {
    uint64_t Width;
    unsigned LoadOpc;
    unsigned StoreOpc;
    unsigned TempReg;
  };
  const CopyStep Steps[] = {
      {16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
      {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
      {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
      {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
      {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}};

  // Greedy decomposition: use each width for as long as it still fits. The
  // descending order keeps the running offset a multiple of the current
  // width, so the division below is exact.
  uint64_t Done = 0;
  for (const CopyStep &Step : Steps)
    for (; Size - Done >= Step.Width; Done += Step.Width)
      EmitCopy(Step.LoadOpc, Step.StoreOpc, Step.TempReg, Done / Step.Width);

  return Code;
}
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
384
bolt/test/runtime/AArch64/inline-memcpy.s
Normal file
384
bolt/test/runtime/AArch64/inline-memcpy.s
Normal file
@ -0,0 +1,384 @@
|
||||
## This test checks that BOLT correctly inlines memcpy calls on AArch64.
|
||||
|
||||
# REQUIRES: system-linux, aarch64-registered-target
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
|
||||
# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
|
||||
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
|
||||
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
|
||||
|
||||
# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 call sites: 16 to memcpy and 1 to _memcpy8)
|
||||
# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls
|
||||
|
||||
# Each function should use optimal size-specific instructions and NO memcpy calls
|
||||
|
||||
# 1-byte copy should use single byte load/store (ldrb/strb)
|
||||
# CHECK-ASM-LABEL: <test_1_byte_direct>:
|
||||
# CHECK-ASM: ldrb{{.*}}w9, [x1]
|
||||
# CHECK-ASM-NEXT: strb{{.*}}w9, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 2-byte copy should use single 16-bit load/store (ldrh/strh)
|
||||
# CHECK-ASM-LABEL: <test_2_byte_direct>:
|
||||
# CHECK-ASM: ldrh{{.*}}w9, [x1]
|
||||
# CHECK-ASM-NEXT: strh{{.*}}w9, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 4-byte copy should use single 32-bit load/store (w register)
|
||||
# CHECK-ASM-LABEL: <test_4_byte_direct>:
|
||||
# CHECK-ASM: ldr{{.*}}w9, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 8-byte copy should use single 64-bit load/store (x register)
|
||||
# CHECK-ASM-LABEL: <test_8_byte_direct>:
|
||||
# CHECK-ASM: ldr{{.*}}x9, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 16-byte copy should use single 128-bit SIMD load/store (q register)
|
||||
# CHECK-ASM-LABEL: <test_16_byte_direct>:
|
||||
# CHECK-ASM: ldr{{.*}}q16, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 32-byte copy should use two 128-bit SIMD operations
|
||||
# CHECK-ASM-LABEL: <test_32_byte_direct>:
|
||||
# CHECK-ASM: ldr{{.*}}q16, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
|
||||
# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
|
||||
# CHECK-ASM: ldr{{.*}}q16, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20]
|
||||
# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20]
|
||||
# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24]
|
||||
# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
|
||||
# CHECK-ASM-LABEL: <test_0_byte>:
|
||||
# CHECK-ASM-NOT: ldr
|
||||
# CHECK-ASM-NOT: str
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# Negative size should NOT be inlined (#-1 is not an unshifted MOVZ immediate, so no size is extracted)
|
||||
# CHECK-ASM-LABEL: <test_negative_size>:
|
||||
# CHECK-ASM: bl{{.*}}<memcpy
|
||||
|
||||
# 128-byte copy should NOT be inlined (too large, original call preserved)
|
||||
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
|
||||
# CHECK-ASM: bl{{.*}}<memcpy
|
||||
|
||||
# ADD immediate from another register should NOT be inlined (the mov+add chain is not tracked)
|
||||
# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
|
||||
# CHECK-ASM: bl{{.*}}<memcpy
|
||||
|
||||
# Register move should NOT be inlined (size unknown at compile time)
|
||||
# CHECK-ASM-LABEL: <test_register_move_negative>:
|
||||
# CHECK-ASM: bl{{.*}}<memcpy
|
||||
|
||||
# Live-in parameter should NOT be inlined (size unknown at compile time)
|
||||
# CHECK-ASM-LABEL: <test_live_in_negative>:
|
||||
# CHECK-ASM: bl{{.*}}<memcpy
|
||||
|
||||
# _memcpy8 should be inlined with end-pointer return (dest+size)
|
||||
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
|
||||
# CHECK-ASM: ldr{{.*}}w9, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
|
||||
# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4
|
||||
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
|
||||
|
||||
# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
|
||||
# CHECK-ASM-LABEL: <complex_operation>:
|
||||
# CHECK-ASM: ldr{{.*}}x9, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
|
||||
# CHECK-ASM-LABEL: <complex_fp_operation>:
|
||||
# CHECK-ASM: ldr{{.*}}q16, [x1]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20]
|
||||
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30]
|
||||
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30]
|
||||
# CHECK-ASM-NOT: bl{{.*}}<memcpy
|
||||
|
||||
.text
|
||||
.globl test_1_byte_direct
|
||||
.type test_1_byte_direct,@function
|
||||
test_1_byte_direct:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #1
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_1_byte_direct, .-test_1_byte_direct
|
||||
|
||||
.globl test_2_byte_direct
|
||||
.type test_2_byte_direct,@function
|
||||
test_2_byte_direct:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #2
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_2_byte_direct, .-test_2_byte_direct
|
||||
|
||||
.globl test_4_byte_direct
|
||||
.type test_4_byte_direct,@function
|
||||
test_4_byte_direct:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #4
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_4_byte_direct, .-test_4_byte_direct
|
||||
|
||||
.globl test_8_byte_direct
|
||||
.type test_8_byte_direct,@function
|
||||
test_8_byte_direct:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #8
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_8_byte_direct, .-test_8_byte_direct
|
||||
|
||||
.globl test_16_byte_direct
|
||||
.type test_16_byte_direct,@function
|
||||
test_16_byte_direct:
|
||||
stp x29, x30, [sp, #-48]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #32
|
||||
mov x2, #16
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #48
|
||||
ret
|
||||
.size test_16_byte_direct, .-test_16_byte_direct
|
||||
|
||||
.globl test_32_byte_direct
|
||||
.type test_32_byte_direct,@function
|
||||
test_32_byte_direct:
|
||||
stp x29, x30, [sp, #-80]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #48
|
||||
mov x2, #32
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #80
|
||||
ret
|
||||
.size test_32_byte_direct, .-test_32_byte_direct
|
||||
|
||||
.globl test_37_byte_arbitrary
|
||||
.type test_37_byte_arbitrary,@function
|
||||
test_37_byte_arbitrary:
|
||||
stp x29, x30, [sp, #-96]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #56
|
||||
mov x2, #37
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #96
|
||||
ret
|
||||
.size test_37_byte_arbitrary, .-test_37_byte_arbitrary
|
||||
|
||||
.globl test_0_byte
|
||||
.type test_0_byte,@function
|
||||
test_0_byte:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #0
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_0_byte, .-test_0_byte
|
||||
|
||||
.globl test_negative_size
|
||||
.type test_negative_size,@function
|
||||
test_negative_size:
|
||||
# Negative size should not be inlined
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #-1
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_negative_size, .-test_negative_size
|
||||
|
||||
.globl test_128_byte_too_large
|
||||
.type test_128_byte_too_large,@function
|
||||
test_128_byte_too_large:
|
||||
stp x29, x30, [sp, #-288]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #152
|
||||
mov x2, #128
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #288
|
||||
ret
|
||||
.size test_128_byte_too_large, .-test_128_byte_too_large
|
||||
|
||||
.globl test_4_byte_add_immediate
|
||||
.type test_4_byte_add_immediate,@function
|
||||
test_4_byte_add_immediate:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x3, #0
|
||||
add x2, x3, #4
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_4_byte_add_immediate, .-test_4_byte_add_immediate
|
||||
|
||||
.globl test_register_move_negative
|
||||
.type test_register_move_negative,@function
|
||||
test_register_move_negative:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x6, #4
|
||||
mov x2, x6
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_register_move_negative, .-test_register_move_negative
|
||||
|
||||
.globl test_live_in_negative
|
||||
.type test_live_in_negative,@function
|
||||
test_live_in_negative:
|
||||
# x2 comes in as parameter, no instruction sets it (should NOT inline)
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
# x2 is live-in, no size-setting instruction
|
||||
bl memcpy
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_live_in_negative, .-test_live_in_negative
|
||||
|
||||
.globl test_memcpy8_4_byte
|
||||
.type test_memcpy8_4_byte,@function
|
||||
test_memcpy8_4_byte:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
mov x29, sp
|
||||
add x1, sp, #16
|
||||
add x0, sp, #8
|
||||
mov x2, #4
|
||||
bl _memcpy8
|
||||
ldp x29, x30, [sp], #32
|
||||
ret
|
||||
.size test_memcpy8_4_byte, .-test_memcpy8_4_byte
|
||||
|
||||
# Simple _memcpy8 implementation that calls memcpy and returns dest+size.
# Its own memcpy call is not inlined because x2 is live-in (size unknown).
|
||||
.globl _memcpy8
|
||||
.type _memcpy8,@function
|
||||
_memcpy8:
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
mov x3, x0
|
||||
bl memcpy
|
||||
add x0, x3, x2
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
.size _memcpy8, .-_memcpy8
|
||||
|
||||
.globl complex_operation
|
||||
.type complex_operation,@function
|
||||
complex_operation:
|
||||
stp x29, x30, [sp, #-32]!
|
||||
str x19, [sp, #16]
|
||||
mov x29, sp
|
||||
ldp x9, x10, [x0]
|
||||
ldp x11, x12, [x0, #16]
|
||||
mov x19, x1
|
||||
mov x8, x0
|
||||
add x0, x1, #32
|
||||
madd x9, x9, x2, x3
|
||||
and x10, x10, x4
|
||||
asr x12, x12, #2
|
||||
mov w2, #8
|
||||
orr x11, x12, x11, lsl #3
|
||||
eor x12, x9, x10
|
||||
mul x10, x11, x10
|
||||
eor x12, x12, x11
|
||||
add x13, x12, x9
|
||||
add x9, x11, x9, asr #4
|
||||
stp x13, x10, [x1]
|
||||
mov w10, w12
|
||||
stp x9, x10, [x1, #16]
|
||||
add x1, x8, #32
|
||||
bl memcpy
|
||||
ldr x0, [x19, #16]
|
||||
ldr x19, [sp, #16]
|
||||
ldp x29, x30, [sp], #32
|
||||
b use
|
||||
.size complex_operation, .-complex_operation
|
||||
|
||||
.globl use
|
||||
.type use,@function
|
||||
use:
|
||||
ret
|
||||
.size use, .-use
|
||||
|
||||
# Same as above but using FP caller-saved registers (Q16/17)
|
||||
.globl complex_fp_operation
|
||||
.type complex_fp_operation,@function
|
||||
complex_fp_operation:
|
||||
stp x29, x30, [sp, #-48]!
|
||||
stp q8, q9, [sp, #16]
|
||||
mov x29, sp
|
||||
ldr q16, [x0]
|
||||
ldr q17, [x0, #16]
|
||||
mov x8, x0
|
||||
add x0, x1, #32
|
||||
fadd v16.4s, v16.4s, v17.4s
|
||||
fmul v17.4s, v16.4s, v17.4s
|
||||
fsub v16.2d, v16.2d, v17.2d
|
||||
mov w2, #64
|
||||
fmax v17.4s, v16.4s, v17.4s
|
||||
fmin v16.2d, v16.2d, v17.2d
|
||||
str q16, [x1]
|
||||
str q17, [x1, #16]
|
||||
add x1, x8, #32
|
||||
bl memcpy
|
||||
ldp q8, q9, [sp, #16]
|
||||
ldp x29, x30, [sp], #48
|
||||
b use_fp
|
||||
.size complex_fp_operation, .-complex_fp_operation
|
||||
|
||||
.globl use_fp
|
||||
.type use_fp,@function
|
||||
use_fp:
|
||||
ret
|
||||
.size use_fp, .-use_fp
|
||||
Loading…
x
Reference in New Issue
Block a user