llvm-project/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
Paul Walker 41a6bb4c05
[LLVM][CodeGen][SVE] Prefer NEON instructions when zeroing Z registers. (#133929)
Several implementations have zero-latency instructions to zero
registers. To date, no implementation has a dedicated SVE instruction, but
we can use the NEON equivalent because it is defined to zero bits
128..VL regardless of the immediate used.

NOTE: The relevant instruction is not available in streaming mode, where
the original SVE DUP instruction remains in use.
2025-04-03 13:15:05 +01:00

74 lines
3.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
; Test that we do not end in an infinite loop (https://github.com/llvm/llvm-project/issues/63322)
; Declaration of the scalable-vector masked scatter intrinsic called by the
; test function below. Operands, in order: the <vscale x 16 x i8> values to
; store, the <vscale x 16 x ptr> destination pointers, an immediate i32
; (the alignment operand, per the LLVM Language Reference), and the
; <vscale x 16 x i1> per-lane mask.
declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)
; Reproducer function for the issue referenced at the top of this file.
; Its only loop block branches back to itself and the function contains no
; `ret`, so the expected assembly ends in an unconditional branch back to the
; loop header (.LBB0_1). The assertions that follow were autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script rather
; than editing them by hand.
define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-LABEL: allocno_reload_assign:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, xzr
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mvn w8, w8
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p4.h, p1.b
; CHECK-NEXT: punpklo p6.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: punpkhi p2.h, p2.b
; CHECK-NEXT: punpklo p3.h, p4.b
; CHECK-NEXT: punpkhi p4.h, p4.b
; CHECK-NEXT: punpklo p5.h, p6.b
; CHECK-NEXT: punpkhi p6.h, p6.b
; CHECK-NEXT: punpklo p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: uunpklo z1.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: uunpklo z3.d, z2.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: st1b { z3.d }, p1, [z0.d]
; CHECK-NEXT: st1b { z2.d }, p2, [z0.d]
; CHECK-NEXT: uunpklo z2.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: st1b { z2.d }, p3, [z0.d]
; CHECK-NEXT: uunpkhi z2.h, z0.b
; CHECK-NEXT: uunpklo z3.s, z2.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: st1b { z1.d }, p4, [z0.d]
; CHECK-NEXT: uunpklo z1.d, z3.s
; CHECK-NEXT: st1b { z1.d }, p5, [z0.d]
; CHECK-NEXT: uunpkhi z1.d, z3.s
; CHECK-NEXT: st1b { z1.d }, p6, [z0.d]
; CHECK-NEXT: uunpklo z1.d, z2.s
; CHECK-NEXT: st1b { z1.d }, p7, [z0.d]
; CHECK-NEXT: uunpkhi z1.d, z2.s
; CHECK-NEXT: st1b { z1.d }, p0, [z0.d]
; CHECK-NEXT: str p8, [x0]
; CHECK-NEXT: b .LBB0_1
br label %1
1: ; preds = %1, %0
; Compare a pointer vector whose lane 0 is null (all other lanes poison)
; against an all-null pointer vector, producing a <vscale x 16 x i1> result.
; NOTE(review): the %constexpr* names suggest these instructions were once
; constant expressions in the original reproducer - not verifiable from here.
%constexpr = icmp eq <vscale x 16 x ptr> insertelement (<vscale x 16 x ptr> poison, ptr null, i64 0), zeroinitializer
; An all-zero shuffle mask selects lane 0 for every output lane, i.e. this
; broadcasts lane 0 of the comparison result.
%constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
; Invert the broadcast predicate by xor-ing it with an all-true splat.
%constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
; Scatter an all-zero byte vector through an all-null pointer vector, gated
; by the inverted predicate.
call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
; Store the original (non-broadcast) comparison result through %p.
store <vscale x 16 x i1> %constexpr, ptr %p, align 16
; Unconditional back edge: the block is its own successor, so the function
; never returns.
br label %1
}
; Use-list order directive, placed after the function body, for the
; <vscale x 16 x i1> poison constant; its three-entry index list { 1, 2, 0 }
; reorders that value's use-list when the module is parsed.
; NOTE(review): presumably retained from the original reproducer so the uses
; are visited in the order that triggered the issue - confirm before removing.
uselistorder <vscale x 16 x i1> poison, { 1, 2, 0 }