
The patch adds patterns to select the EXT_ZZI_CONSTRUCTIVE pseudo instead of the destructive EXT_ZZI instruction for vector_splice. This only works when the two inputs to vector_splice are identical. Because the registers are no longer tied, the register allocator has more freedom and many MOVs are replaced with MOVPRFX. In some cases, however, the same register could have been chosen for both input and output, but the register allocator preferred not to. As a result, some test cases now contain more instructions: a MOVPRFX is emitted where previously no MOV was needed.
53 lines
1.9 KiB
LLVM
53 lines
1.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s

; Each function below loads a <16 x i32> value, returns a splat of one of its
; lanes as a fixed-width vector, and also stores the doubled value back, so the
; loaded value has further uses after the lane is extracted.

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

; Returns a <4 x i32> splat of lane 14 of the <16 x i32> value loaded from
; %arg1, and stores that value added to itself back to %arg1.
; The expected output extracts the lane with a movprfx/ext pair followed by a
; dup, while the loaded registers (z1, z2) remain in use for the add and store.
define <4 x i32> @test(ptr %arg1, ptr %arg2) {
; CHECK-LABEL: test:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mov x8, #8 // =0x8
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT:    movprfx z0, z1
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #16
; CHECK-NEXT:    add z1.s, z1.s, z1.s
; CHECK-NEXT:    add z2.s, z2.s, z2.s
; CHECK-NEXT:    dup v0.4s, v0.s[2]
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT:    st1w { z2.s }, p0, [x0]
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i32>, ptr %arg1, align 256
  ; %1 is the second shuffle operand, but the mask below never selects from it.
  %1 = load <16 x i32>, ptr %arg2, align 256
  ; Every mask index is 14 (< 16), so all four result lanes come from %0.
  %shvec = shufflevector <16 x i32> %0, <16 x i32> %1, <4 x i32> <i32 14, i32 14, i32 14, i32 14>
  ; Second use of %0: the doubled value is written back to %arg1.
  %2 = add <16 x i32> %0, %0
  store <16 x i32> %2, ptr %arg1, align 256
  ret <4 x i32> %shvec
}

; Same shape as the <4 x i32> case, but the returned splat of lane 14 is a
; <2 x i32>. The expected output uses a movprfx/ext pair with a byte offset of
; 24 followed by a dup of lane 0, and the loaded registers (z1, z2) remain in
; use for the add and store.
define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
; CHECK-LABEL: test2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mov x8, #8 // =0x8
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
; CHECK-NEXT:    movprfx z0, z1
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #24
; CHECK-NEXT:    add z1.s, z1.s, z1.s
; CHECK-NEXT:    add z2.s, z2.s, z2.s
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; CHECK-NEXT:    st1w { z2.s }, p0, [x0]
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i32>, ptr %arg1, align 256
  ; %1 is the second shuffle operand, but the mask below never selects from it.
  %1 = load <16 x i32>, ptr %arg2, align 256
  ; Both mask indices are 14 (< 16), so both result lanes come from %0.
  %shvec = shufflevector <16 x i32> %0, <16 x i32> %1, <2 x i32> <i32 14, i32 14>
  ; Second use of %0: the doubled value is written back to %arg1.
  %2 = add <16 x i32> %0, %0
  store <16 x i32> %2, ptr %arg1, align 256
  ret <2 x i32> %shvec
}