[BOLT][AArch64] Inlining of Memcpy (#154929)

The memcpy-inlining pass in BOLT was previously X86-specific and relied on the `rep movsb` instruction. This patch adds a static size analysis for AArch64 memcpy inlining: it extracts the copy size from the instructions preceding the call and uses it to generate an optimal sequence of width-specific loads and stores.
2025-09-09 14:09:23 +01:00 · 2025-09-09 14:09:23 +01:00 · 244588b9d7
commit 244588b9d7
parent 872d2c90be
6 changed files with 540 additions and 4 deletions
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@ -637,7 +637,7 @@

 - `--inline-memcpy`

-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)

 - `--inline-small-functions`

--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@ -14,6 +14,7 @@
 #ifndef BOLT_CORE_MCPLUSBUILDER_H
 #define BOLT_CORE_MCPLUSBUILDER_H

+#include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/MCPlus.h"
 #include "bolt/Core/Relocation.h"
 #include "llvm/ADT/ArrayRef.h"
@ -1902,6 +1903,15 @@ public:
    return {};
  }

+  /// Find the memcpy size in bytes by inspecting the instructions of \p BB
+  /// that precede \p CallInst. This default implementation performs no
+  /// analysis and always returns std::nullopt (size unknown).
+  virtual std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const {
+    return std::nullopt;
+  }
+
  /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
  /// (dest + n) instead of dest.
  virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
@ -1909,6 +1919,22 @@ public:
    return {};
  }

+  /// Creates a size-aware inline memcpy sequence. \p KnownSize, when set, is
+  /// the copy size in bytes. This default ignores \p KnownSize and forwards
+  /// to createInlineMemcpy(ReturnEnd); targets may override it to specialize.
+  virtual InstructionListType
+  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+    return createInlineMemcpy(ReturnEnd);
+  }
+
+  /// Extract the immediate value from \p Inst if it is a move-immediate that
+  /// sets \p TargetReg. This default implementation recognizes no
+  /// instruction and always returns std::nullopt.
+  virtual std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+    return std::nullopt;
+  }
+
  /// Create a target-specific relocation out of the \p Fixup.
  /// Note that not every fixup could be converted into a relocation.
  virtual std::optional<Relocation>
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
 }

 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
    return Error::success();

  uint64_t NumInlined = 0;
@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

+        // Extract size from preceding instructions (AArch64 only).
+        // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
+        std::optional<uint64_t> KnownSize =
+            BC.MIB->findMemcpySizeInBytes(BB, II);
+
+        if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
+          continue;
+
        const InstructionListType NewCode =
-            BC.MIB->createInlineMemcpy(IsMemcpy8);
+            BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@ -248,7 +248,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),

 static cl::opt<bool> StringOps(
    "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
    cl::cat(BoltOptCategory));

 static cl::opt<bool> StripRepRet(
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@ -2620,6 +2620,122 @@ public:
  getInstructionSize(const MCInst &Inst) const override {
    return 4;
  }
+
+  /// Returns the immediate materialized by \p Inst when it is an unshifted
+  /// MOVZ (X or W register form) whose destination aliases \p TargetReg;
+  /// returns std::nullopt for every other instruction.
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    const unsigned Opcode = Inst.getOpcode();
+    if (Opcode != AArch64::MOVZXi && Opcode != AArch64::MOVZWi)
+      return std::nullopt;
+
+    // Operands: (0) destination register, (1) value, (2) shift amount.
+    // The value operand is not guaranteed to be a literal: it can be a
+    // symbolic expression, on which getImm() must not be called. Only
+    // literal, unshifted immediates are accepted.
+    const MCOperand &Value = Inst.getOperand(1);
+    const MCOperand &Shift = Inst.getOperand(2);
+    if (!Value.isImm() || !Shift.isImm() || Shift.getImm() != 0)
+      return std::nullopt;
+
+    if (!getAliases(TargetReg)[Inst.getOperand(0).getReg()])
+      return std::nullopt;
+
+    return Value.getImm();
+  }
+
+  /// Determines the memcpy size (third integer argument register) for the
+  /// call at \p CallInst by scanning the instructions of \p BB that precede
+  /// it.
+  ///
+  /// The size is known only when the *last* instruction that writes the size
+  /// register (or one of its aliases) before the call is a move-immediate
+  /// recognized by extractMoveImmediate(). Returns std::nullopt when the
+  /// register is not written in this block before the call, or when its
+  /// final definition is any other kind of instruction.
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    if (SizeReg == getNoRegister())
+      return std::nullopt;
+
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    const BitVector &SizeRegAliases = getAliases(SizeReg);
+
+    std::optional<uint64_t> KnownSize;
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      // Every write redefines the size. Keep scanning so that a later
+      // definition overrides (or invalidates) an earlier one; stopping at
+      // the first write would report a stale value.
+      if (WrittenRegs.anyCommon(SizeRegAliases))
+        KnownSize = extractMoveImmediate(Inst, SizeReg);
+    }
+    return KnownSize;
+  }
+
+  /// AArch64 implementation of the size-aware memcpy expansion.
+  /// \p KnownSize must hold a value; the copy of that many bytes is produced
+  /// by generateSizeSpecificMemcpy(). When \p ReturnEnd is set (the _memcpy8
+  /// flavour), X0 is additionally advanced by the size so that it holds
+  /// dest + size.
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    assert(KnownSize.has_value() &&
+           "AArch64 memcpy inlining requires known size");
+    const uint64_t NumBytes = *KnownSize;
+
+    InstructionListType Insts;
+    generateSizeSpecificMemcpy(Insts, NumBytes);
+    if (!ReturnEnd)
+      return Insts;
+
+    // X0 += NumBytes, so the caller receives the end of the destination.
+    Insts.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                           .addReg(AArch64::X0)
+                           .addReg(AArch64::X0)
+                           .addImm(NumBytes)
+                           .addImm(0));
+    return Insts;
+  }
+
+  /// Appends to \p Code the load/store pairs that copy \p Size bytes from the
+  /// address in X1 to the address in X0, and returns a copy of \p Code.
+  ///
+  /// The copy is decomposed greedily into 16-, 8-, 4-, 2- and 1-byte accesses,
+  /// using Q16 as the temporary for 16-byte accesses and X9/W9 for narrower
+  /// ones. The 32-byte copy is the only size that uses a second temporary
+  /// (Q17) for its second access.
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    // Caller should have already filtered out sizes > 64 bytes.
+    assert(Size <= 64 &&
+           "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+    // Emits a load of Reg from [X1 + Slot] followed by a store of Reg to
+    // [X0 + Slot]. Slot is expressed in units of the access width, not bytes.
+    const auto EmitCopy = [&Code](unsigned LoadOpc, unsigned StoreOpc,
+                                  unsigned Reg, unsigned Slot) {
+      Code.emplace_back(
+          MCInstBuilder(LoadOpc).addReg(Reg).addReg(AArch64::X1).addImm(Slot));
+      Code.emplace_back(
+          MCInstBuilder(StoreOpc).addReg(Reg).addReg(AArch64::X0).addImm(Slot));
+    };
+
+    // 32 bytes: two 16-byte accesses through two distinct temporaries.
+    if (Size == 32) {
+      EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
+      return Code;
+    }
+
+    // Access widths from widest to narrowest. Every other size, including the
+    // exact powers of two up to 16, is a greedy walk over this table.
+    struct CopyStep {
+      uint64_t Width;
+      unsigned LoadOpc;
+      unsigned StoreOpc;
+      unsigned TempReg;
+    };
+    const CopyStep Steps[] = {
+        {16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+        {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+        {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+        {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+        {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}};
+
+    // Done is always a multiple of every width still to be tried, so the
+    // division below yields an exact slot index.
+    uint64_t Done = 0;
+    for (const CopyStep &Step : Steps)
+      for (; Size - Done >= Step.Width; Done += Step.Width)
+        EmitCopy(Step.LoadOpc, Step.StoreOpc, Step.TempReg, Done / Step.Width);
+    return Code;
+  }
 };

 } // end anonymous namespace
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@ -0,0 +1,384 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux, aarch64-registered-target
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (11 of the 17 memcpy/_memcpy8 call sites in this file are inlined)
+# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w9, [x1]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w9, [x1]
+# CHECK-ASM-NEXT: strh{{.*}}w9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
+# CHECK-ASM-LABEL: <test_0_byte>:
+# CHECK-ASM-NOT: ldr
+# CHECK-ASM-NOT: str
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# Negative size should NOT be inlined (invalid size parameter)
+# CHECK-ASM-LABEL: <test_negative_size>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# 128-byte copy should NOT be inlined (too large, original call preserved)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Size computed by ADD from a general-purpose register (not a plain MOV immediate) should NOT be inlined (can't track mov+add chain)
+# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Register move should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Live-in parameter should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# _memcpy8 should be inlined with end-pointer return (dest+size)
+# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+
+# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
+# CHECK-ASM-LABEL: <complex_operation>:
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
+# CHECK-ASM-LABEL: <complex_fp_operation>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+	.text
+	.globl	test_1_byte_direct
+	.type	test_1_byte_direct,@function
+test_1_byte_direct:
+	# 1-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x2, #1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_1_byte_direct, .-test_1_byte_direct
+
+	.globl	test_2_byte_direct
+	.type	test_2_byte_direct,@function
+test_2_byte_direct:
+	# 2-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x2, #2
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_2_byte_direct, .-test_2_byte_direct
+
+	.globl	test_4_byte_direct
+	.type	test_4_byte_direct,@function
+test_4_byte_direct:
+	# 4-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x2, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_direct, .-test_4_byte_direct
+
+	.globl	test_8_byte_direct
+	.type	test_8_byte_direct,@function
+test_8_byte_direct:
+	# 8-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# src = [sp, #16..23] and dst = [sp, #24..31], so the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x2, #8
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_8_byte_direct, .-test_8_byte_direct
+
+	.globl	test_16_byte_direct
+	.type	test_16_byte_direct,@function
+test_16_byte_direct:
+	# 16-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (48 bytes): src = [sp, #16..31], dst = [sp, #32..47].
+	stp	x29, x30, [sp, #-48]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #32
+	mov	x2, #16
+	bl	memcpy
+	ldp	x29, x30, [sp], #48
+	ret
+	.size	test_16_byte_direct, .-test_16_byte_direct
+
+	.globl	test_32_byte_direct
+	.type	test_32_byte_direct,@function
+test_32_byte_direct:
+	# 32-byte memcpy with the size set by a MOV immediate right before the call.
+	# Frame (80 bytes): src = [sp, #16..47], dst = [sp, #48..79].
+	stp	x29, x30, [sp, #-80]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #48
+	mov	x2, #32
+	bl	memcpy
+	ldp	x29, x30, [sp], #80
+	ret
+	.size	test_32_byte_direct, .-test_32_byte_direct
+
+	.globl	test_37_byte_arbitrary
+	.type	test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+	# 37 is not a power of two, so this exercises the greedy decomposition.
+	# Frame (96 bytes): src = [sp, #16..52], dst = [sp, #56..92].
+	stp	x29, x30, [sp, #-96]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #56
+	mov	x2, #37
+	bl	memcpy
+	ldp	x29, x30, [sp], #96
+	ret
+	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+	.globl	test_0_byte
+	.type	test_0_byte,@function
+test_0_byte:
+	# Zero-length memcpy: the call is expected to be removed and replaced by
+	# an empty sequence, since there is nothing to copy.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #0
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_0_byte, .-test_0_byte
+
+	.globl	test_negative_size
+	.type	test_negative_size,@function
+test_negative_size:
+	# Negative size should not be inlined: the original call must be kept.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #-1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_negative_size, .-test_negative_size
+
+	.globl	test_128_byte_too_large
+	.type	test_128_byte_too_large,@function
+test_128_byte_too_large:
+	# 128 bytes is above the 64-byte inlining limit: the call must be kept.
+	# Frame (288 bytes): src = [sp, #16..143], dst = [sp, #152..279].
+	stp	x29, x30, [sp, #-288]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #152
+	mov	x2, #128
+	bl	memcpy
+	ldp	x29, x30, [sp], #288
+	ret
+	.size	test_128_byte_too_large, .-test_128_byte_too_large
+
+	.globl	test_4_byte_add_immediate
+	.type	test_4_byte_add_immediate,@function
+test_4_byte_add_immediate:
+	# The size reaches x2 through an ADD rather than a MOV immediate, so the
+	# call is expected to be kept.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x3, #0
+	add	x2, x3, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_add_immediate, .-test_4_byte_add_immediate
+
+	.globl	test_register_move_negative
+	.type	test_register_move_negative,@function
+test_register_move_negative:
+	# The size reaches x2 through a register-to-register move, so the call is
+	# expected to be kept.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x6, #4
+	mov	x2, x6
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_register_move_negative, .-test_register_move_negative
+
+	.globl	test_live_in_negative
+	.type	test_live_in_negative,@function
+test_live_in_negative:
+	# x2 arrives as a live-in value: no instruction in this function writes it
+	# before the call, so the size is unknown and the call must NOT be inlined.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	# Intentionally no write to x2 here.
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_live_in_negative, .-test_live_in_negative
+
+	.globl	test_memcpy8_4_byte
+	.type	test_memcpy8_4_byte,@function
+test_memcpy8_4_byte:
+	# 4-byte copy through _memcpy8: the inlined sequence must also advance x0
+	# by the size.
+	# Frame (32 bytes): [sp, #0] = x29, [sp, #8] = x30, [sp, #16..31] scratch.
+	# Both buffers live in the scratch area so that the copy cannot clobber
+	# the saved frame record.
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #24
+	mov	x2, #4
+	bl	_memcpy8
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_memcpy8_4_byte, .-test_memcpy8_4_byte
+
+	# Simple _memcpy8 implementation that calls memcpy and returns dest+size.
+	# Its own memcpy call has a live-in size (x2 is never written here), so it
+	# is one of the calls that stay un-inlined.
+	# NOTE(review): x2 and x3 are read after the call without being saved;
+	# this relies on memcpy leaving them intact -- confirm if this helper is
+	# ever executed.
+	.globl	_memcpy8
+	.type	_memcpy8,@function
+_memcpy8:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	mov	x3, x0
+	bl	memcpy
+	add	x0, x3, x2
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	_memcpy8, .-_memcpy8
+
+	.globl	complex_operation
+	.type	complex_operation,@function
+complex_operation:
+	# Larger function body around an 8-byte memcpy. The size is set through
+	# the W alias of x2 (mov w2, #8) several instructions before the call,
+	# with unrelated instructions in between. x9 is used by the surrounding
+	# code up to the last stp before the call and is not read after it.
+	stp     x29, x30, [sp, #-32]!
+	str     x19, [sp, #16]
+	mov     x29, sp
+	ldp     x9, x10, [x0]
+	ldp     x11, x12, [x0, #16]
+	mov     x19, x1
+	mov     x8, x0
+	add     x0, x1, #32
+	madd    x9, x9, x2, x3
+	and     x10, x10, x4
+	asr     x12, x12, #2
+	mov     w2, #8
+	orr     x11, x12, x11, lsl #3
+	eor     x12, x9, x10
+	mul     x10, x11, x10
+	eor     x12, x12, x11
+	add     x13, x12, x9
+	add     x9, x11, x9, asr #4
+	stp     x13, x10, [x1]
+	mov     w10, w12
+	stp     x9, x10, [x1, #16]
+	add     x1, x8, #32
+	bl      memcpy
+	ldr     x0, [x19, #16]
+	ldr     x19, [sp, #16]
+	ldp     x29, x30, [sp], #32
+	b       use
+	.size	complex_operation, .-complex_operation
+
+	# Tail-call target of complex_operation; returns immediately.
+	.globl	use
+	.type	use,@function
+use:
+	ret
+	.size	use, .-use
+
+# Same idea as complex_operation, but using FP caller-saved registers (Q16/17):
+# q16/q17 are used by the arithmetic before the 64-byte memcpy and are not
+# read after it. The size is set through the W alias of x2 (mov w2, #64).
+	.globl	complex_fp_operation
+	.type	complex_fp_operation,@function
+complex_fp_operation:
+	stp     x29, x30, [sp, #-48]!
+	stp     q8, q9, [sp, #16]
+	mov     x29, sp
+	ldr     q16, [x0]
+	ldr     q17, [x0, #16]
+	mov     x8, x0
+	add     x0, x1, #32
+	fadd    v16.4s, v16.4s, v17.4s
+	fmul    v17.4s, v16.4s, v17.4s
+	fsub    v16.2d, v16.2d, v17.2d
+	mov     w2, #64
+	fmax    v17.4s, v16.4s, v17.4s
+	fmin    v16.2d, v16.2d, v17.2d
+	str     q16, [x1]
+	str     q17, [x1, #16]
+	add     x1, x8, #32
+	bl      memcpy
+	ldp     q8, q9, [sp, #16]
+	ldp     x29, x30, [sp], #48
+	b       use_fp
+	.size	complex_fp_operation, .-complex_fp_operation
+
+	# Tail-call target of complex_fp_operation; returns immediately.
+	.globl	use_fp
+	.type	use_fp,@function
+use_fp:
+	ret
+	.size	use_fp, .-use_fp