
Several implementations have zero-latency instructions to zero registers. To date, no implementation has a dedicated SVE instruction, but we can use the NEON equivalent because it is defined to zero bits 128..VL regardless of the immediate used. NOTE: The relevant instruction is not available in streaming mode, where the original SVE DUP instruction remains in use.
74 lines
3.2 KiB
LLVM
74 lines
3.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s

; Test that llc does not end up in an infinite loop while compiling this function (https://github.com/llvm/llvm-project/issues/63322)

; Masked scatter on scalable vectors. Operands, in order: the values to store,
; the per-lane destination pointers, the alignment (immediate), and the lane mask.
declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x 16 x ptr>, i32 immarg, <vscale x 16 x i1>)
; The entry block branches straight into block %1, which unconditionally
; branches back to itself, so the function has no return and loops forever.
; Each iteration:
;   * compares a pointer vector (lane 0 = null, other lanes poison) against zero,
;   * splats lane 0 of that compare result and inverts it to form a mask,
;   * scatters zero bytes to an all-zero pointer vector under that mask,
;   * stores the un-splatted compare result through %p.
; The assertions below are tool-generated (see the NOTE line at the top of the
; file); regenerate them rather than editing them by hand.
define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-LABEL: allocno_reload_assign:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, xzr
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mvn w8, w8
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p4.h, p1.b
; CHECK-NEXT: punpklo p6.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: punpkhi p2.h, p2.b
; CHECK-NEXT: punpklo p3.h, p4.b
; CHECK-NEXT: punpkhi p4.h, p4.b
; CHECK-NEXT: punpklo p5.h, p6.b
; CHECK-NEXT: punpkhi p6.h, p6.b
; CHECK-NEXT: punpklo p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: uunpklo z1.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: uunpklo z3.d, z2.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEXT: st1b { z3.d }, p1, [z0.d]
; CHECK-NEXT: st1b { z2.d }, p2, [z0.d]
; CHECK-NEXT: uunpklo z2.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: st1b { z2.d }, p3, [z0.d]
; CHECK-NEXT: uunpkhi z2.h, z0.b
; CHECK-NEXT: uunpklo z3.s, z2.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: st1b { z1.d }, p4, [z0.d]
; CHECK-NEXT: uunpklo z1.d, z3.s
; CHECK-NEXT: st1b { z1.d }, p5, [z0.d]
; CHECK-NEXT: uunpkhi z1.d, z3.s
; CHECK-NEXT: st1b { z1.d }, p6, [z0.d]
; CHECK-NEXT: uunpklo z1.d, z2.s
; CHECK-NEXT: st1b { z1.d }, p7, [z0.d]
; CHECK-NEXT: uunpkhi z1.d, z2.s
; CHECK-NEXT: st1b { z1.d }, p0, [z0.d]
; CHECK-NEXT: str p8, [x0]
; CHECK-NEXT: b .LBB0_1
  br label %1

1:                                                ; preds = %1, %0
  ; Lane 0 holds `ptr null`, all other lanes are poison; compare against zero.
  %constexpr = icmp eq <vscale x 16 x ptr> insertelement (<vscale x 16 x ptr> poison, ptr null, i64 0), zeroinitializer
  ; Broadcast lane 0 of the compare result to every lane.
  %constexpr1 = shufflevector <vscale x 16 x i1> %constexpr, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
  ; XOR with an all-true splat, i.e. invert every lane of the splatted mask.
  %constexpr2 = xor <vscale x 16 x i1> %constexpr1, shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
  ; Scatter zero bytes to an all-zero pointer vector under the inverted mask.
  call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> %constexpr2)
  ; Store the original (un-splatted) compare result through %p.
  store <vscale x 16 x i1> %constexpr, ptr %p, align 16
  br label %1
}

; Explicit use-list order for the `<vscale x 16 x i1> poison` constant.
; NOTE(review): presumably kept so the module preserves the use ordering of the
; original reproducer - confirm before removing.
uselistorder <vscale x 16 x i1> poison, { 1, 2, 0 }