
; The patch attempts to optimize a sequence of SIMD loads from the same base
; pointer:
;
;   %0 = gep float*, float* base, i32 4
;   %1 = bitcast float* %0 to <4 x float>*
;   %2 = load <4 x float>, <4 x float>* %1
;   ...
;   %n1 = gep float*, float* base, i32 N
;   %n2 = bitcast float* %n1 to <4 x float>*
;   %n3 = load <4 x float>, <4 x float>* %n2
;
; For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #16].
; However, 32-bit NEON VLD1/VST1 lack the [Wn, #imm] addressing mode, so the
; address is computed before every ld/st instruction:
;
;   add r2, r0, #32
;   add r0, r0, #16
;   vld1.32 {d18, d19}, [r2]
;   vld1.32 {d22, d23}, [r0]
;
; This can be improved by computing the address for the first load, and then
; using a post-indexed form of VLD1/VST1 to load the rest:
;
;   add r0, r0, #16
;   vld1.32 {d18, d19}, [r0]!
;   vld1.32 {d22, d23}, [r0]
;
; In order to do that, the patch adds more patterns to DAGCombine:
;
;   - (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1 and
;     inc2 are constants.
;   - (or ptr inc) is now recognized as a pointer increment if ptr is
;     sufficiently aligned.
;
; In addition to that, we now search for all possible base updates and then
; pick the best one.
;
; Differential Revision: https://reviews.llvm.org/D108988
; File info: 46 lines, 1.7 KiB, LLVM
; RUN: llc -mtriple=thumbv7k-apple-watchos %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V7K
; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AAPCS
; RUN: llc -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-APCS

; Callee side of the test: the function returns its <32 x i8> argument
; unchanged, so the directives below only pin down how %vec is reloaded for
; the return value on each of the three tested targets.
; NOTE(review): the leading unnamed [9 x double] parameter presumably pushes
; %vec out of registers and onto the stack -- confirm against each ABI.
define <32 x i8> @test_consume_arg([9 x double], <32 x i8> %vec) {
; CHECK-LABEL: test_consume_arg:

; thumbv7k / watchOS run: two 16-byte loads from sp+16 and sp+32, each through
; a freshly computed base register and carrying a :128 alignment hint.
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16
; CHECK-V7K: vld1.64 {d0, d1}, [r[[BASE]]:128]
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #32
; CHECK-V7K: vld1.64 {d2, d3}, [r[[BASE]]:128]

; thumbv7 / linux-gnueabihf run: same shape, but from sp+8 and sp+24 and with
; no alignment hint on the loads.
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8
; CHECK-AAPCS: vld1.64 {d0, d1}, [r[[BASE]]]
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #24
; CHECK-AAPCS: vld1.64 {d2, d3}, [r[[BASE]]]

; thumbv7 / iOS run: 32-bit-element loads from sp+76 and then sp+60; the
; destination d-registers are not pinned to specific numbers.
; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #76
; CHECK-APCS: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #60
; CHECK-APCS: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]

  ret <32 x i8> %vec
}

; Caller side of the test: passes an all-zero <32 x i8> (after an undef
; [9 x double]) to @test_consume_arg and ignores the result. The directives
; pin down how the outgoing vector argument is stored: on every target the
; base address is formed once, the first vector store uses the writeback
; ("!") form, and the second store reuses the updated base register.
define void @test_produce_arg() {
; CHECK-LABEL: test_produce_arg:

; thumbv7k / watchOS run: base = sp+16, both stores carry a :128 alignment
; hint.
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16
; CHECK-V7K: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]!
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]:128]

; thumbv7 / linux-gnueabihf run: base = sp+8, no alignment hint.
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8
; CHECK-AAPCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]

; thumbv7 / iOS run: sp is copied into two registers; a post-indexed str
; through the second copy advances it by 60 before the two vector stores.
; The R4 capture is not referenced again by the directives in this function.
; CHECK-APCS: mov r[[R4:[0-9]+]], sp
; CHECK-APCS: mov r[[BASE:[0-9]+]], sp
; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #60
; CHECK-APCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]

  call <32 x i8> @test_consume_arg([9 x double] undef, <32 x i8> zeroinitializer)
  ret void
}