
The patch adds patterns to select the EXT_ZZI_CONSTRUCTIVE pseudo instead of the destructive EXT_ZZI instruction for vector_splice. This only works when the two inputs to vector_splice are identical. Because the registers are no longer tied, the register allocator has more freedom, and many MOVs are replaced with MOVPRFX. In some cases, however, the same register could have been chosen for both input and output, but the register allocator preferred not to. As a result, some test cases now have more instructions: a MOVPRFX is emitted where previously no MOV was needed.
152 lines
5.9 KiB
LLVM
152 lines
5.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve < %s | FileCheck %s
; RUN: llc -mattr=+dotprod,+sve < %s | FileCheck %s --check-prefix=DOT
; RUN: llc -mattr=+dotprod,+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE
; RUN: llc -mattr=+dotprod,+sme -force-streaming < %s | FileCheck %s --check-prefix=STREAMING-SVE

; Three configurations are exercised: SVE only (default prefix), SVE with
; dotprod (DOT prefix), and the streaming-compatible / streaming invocations.
; The last two share the STREAMING-SVE prefix, so both must produce output
; that satisfies the same assertions.

target triple = "aarch64-unknown-linux-gnu"

; Unsigned widening add-reduction: zero-extends every i8 lane of %a to i32 and
; sums the lanes with llvm.vector.reduce.add, returning the i32 total.
;
; Lowerings pinned by the assertions below:
;   - default prefix: NEON ushll/ushll2 + uaddl/uaddl2 widening tree, then addv.
;   - DOT prefix: two udot accumulations against a splat of byte 1, then addv.
;   - STREAMING-SVE prefix: uunpklo widening, with each upper half extracted by
;     a movprfx + ext #8 pair, followed by vector adds and a predicated uaddv.
;
; NOTE(review): the name says v16i8 but the argument is <32 x i8> - confirm the
; name is intentional before renaming anything (the LABEL checks depend on it).
define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
; CHECK-LABEL: reduce_uaddv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll2 v2.8h, v1.16b, #0
; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h
; CHECK-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; DOT-LABEL: reduce_uaddv_v16i8:
; DOT: // %bb.0:
; DOT-NEXT: movi v2.16b, #1
; DOT-NEXT: movi v3.2d, #0000000000000000
; DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; DOT-NEXT: addv s0, v3.4s
; DOT-NEXT: fmov w0, s0
; DOT-NEXT: ret
;
; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
; STREAMING-SVE: // %bb.0:
; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; STREAMING-SVE-NEXT: movprfx z2, z1
; STREAMING-SVE-NEXT: ext z2.b, z2.b, z1.b, #8
; STREAMING-SVE-NEXT: movprfx z3, z0
; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8
; STREAMING-SVE-NEXT: ptrue p0.s, vl4
; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b
; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b
; STREAMING-SVE-NEXT: uunpklo z2.h, z2.b
; STREAMING-SVE-NEXT: uunpklo z3.h, z3.b
; STREAMING-SVE-NEXT: movprfx z4, z1
; STREAMING-SVE-NEXT: ext z4.b, z4.b, z1.b, #8
; STREAMING-SVE-NEXT: movprfx z7, z0
; STREAMING-SVE-NEXT: ext z7.b, z7.b, z0.b, #8
; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h
; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h
; STREAMING-SVE-NEXT: movprfx z5, z2
; STREAMING-SVE-NEXT: ext z5.b, z5.b, z2.b, #8
; STREAMING-SVE-NEXT: movprfx z6, z3
; STREAMING-SVE-NEXT: ext z6.b, z6.b, z3.b, #8
; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h
; STREAMING-SVE-NEXT: uunpklo z4.s, z4.h
; STREAMING-SVE-NEXT: uunpklo z7.s, z7.h
; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h
; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h
; STREAMING-SVE-NEXT: uunpklo z6.s, z6.h
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: add z1.s, z3.s, z2.s
; STREAMING-SVE-NEXT: add z2.s, z7.s, z4.s
; STREAMING-SVE-NEXT: add z3.s, z6.s, z5.s
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: add z1.s, z2.s, z3.s
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
  %1 = zext <32 x i8> %a to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

; Signed widening add-reduction: sign-extends every i8 lane of %a to i32 and
; sums the lanes with llvm.vector.reduce.add, returning the i32 total.
;
; Lowerings pinned by the assertions below:
;   - default prefix: NEON sshll/sshll2 + saddl/saddl2 widening tree, then addv.
;   - DOT prefix: two sdot accumulations against a splat of byte 1, then addv.
;   - STREAMING-SVE prefix: sunpklo widening, with each upper half extracted by
;     a movprfx + ext #8 pair, followed by vector adds and a predicated uaddv
;     (the final cross-lane sum is uaddv here as well, per the assertions).
;
; NOTE(review): the name says v16i8 but the argument is <32 x i8> - confirm the
; name is intentional before renaming anything (the LABEL checks depend on it).
define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
; CHECK-LABEL: reduce_saddv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll2 v2.8h, v1.16b, #0
; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
; CHECK-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEXT: saddl2 v4.4s, v3.8h, v2.8h
; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h
; CHECK-NEXT: saddl2 v5.4s, v0.8h, v1.8h
; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
; DOT-LABEL: reduce_saddv_v16i8:
; DOT: // %bb.0:
; DOT-NEXT: movi v2.16b, #1
; DOT-NEXT: movi v3.2d, #0000000000000000
; DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; DOT-NEXT: addv s0, v3.4s
; DOT-NEXT: fmov w0, s0
; DOT-NEXT: ret
;
; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
; STREAMING-SVE: // %bb.0:
; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; STREAMING-SVE-NEXT: movprfx z2, z1
; STREAMING-SVE-NEXT: ext z2.b, z2.b, z1.b, #8
; STREAMING-SVE-NEXT: movprfx z3, z0
; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8
; STREAMING-SVE-NEXT: ptrue p0.s, vl4
; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b
; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b
; STREAMING-SVE-NEXT: sunpklo z2.h, z2.b
; STREAMING-SVE-NEXT: sunpklo z3.h, z3.b
; STREAMING-SVE-NEXT: movprfx z4, z1
; STREAMING-SVE-NEXT: ext z4.b, z4.b, z1.b, #8
; STREAMING-SVE-NEXT: movprfx z7, z0
; STREAMING-SVE-NEXT: ext z7.b, z7.b, z0.b, #8
; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h
; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h
; STREAMING-SVE-NEXT: movprfx z5, z2
; STREAMING-SVE-NEXT: ext z5.b, z5.b, z2.b, #8
; STREAMING-SVE-NEXT: movprfx z6, z3
; STREAMING-SVE-NEXT: ext z6.b, z6.b, z3.b, #8
; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h
; STREAMING-SVE-NEXT: sunpklo z4.s, z4.h
; STREAMING-SVE-NEXT: sunpklo z7.s, z7.h
; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h
; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h
; STREAMING-SVE-NEXT: sunpklo z6.s, z6.h
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: add z1.s, z3.s, z2.s
; STREAMING-SVE-NEXT: add z2.s, z7.s, z4.s
; STREAMING-SVE-NEXT: add z3.s, z6.s, z5.s
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: add z1.s, z2.s, z3.s
; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s
; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
; STREAMING-SVE-NEXT: fmov w0, s0
; STREAMING-SVE-NEXT: ret
  %1 = sext <32 x i8> %a to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}