
Re-landing #116970 after fixing miscompilation error. The original change made it possible for CMPZ to have multiple uses; `ARMDAGToDAGISel::SelectCMPZ` was not prepared for this. Pull Request: https://github.com/llvm/llvm-project/pull/118887 Original commit message: Following #116547 and #116676, this PR changes the type of results and operands of some nodes to accept / return a normal type instead of Glue. Unfortunately, changing the result type of one node requires changing the operand types of all potential consumer nodes, which in turn requires changing the result types of all other possible producer nodes. So this is a bulk change.
1564 lines
71 KiB
LLVM
1564 lines
71 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
; test_fadd: vector loop over <8 x half> that stores (load from %A) + splat(%B)
; to %C, advancing %index by 8 until it equals %n. The splat is the SECOND
; fadd operand here. The entry block asserts via llvm.assume that the low
; three bits of %n are zero.
; The autogenerated CHECK lines expect the scalar to be moved to a GPR once
; (vmov.f16 r3, s0) and consumed by the vector-by-scalar form
; `vadd.f16 q0, q0, r3` inside the loop, with no vdup.
define arm_aapcs_vfpcc void @test_fadd(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fadd:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB0_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: .LBB0_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vadd.f16 q0, q0, r3
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT: bne .LBB0_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes (insertelement into lane 0 + zero-mask shuffle).
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: vector + splat (splat on the right-hand side).
%i3 = fadd fast <8 x half> %wide.load, %broadcast.splat11
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fadd_r: same loop as @test_fadd but with the fadd operands reversed,
; i.e. splat(%B) is the FIRST operand and the loaded vector the second.
; The CHECK lines expect the same code as the non-reversed case: one
; `vmov.f16 r3, s0` before the loop and `vadd.f16 q0, q0, r3` inside it.
define arm_aapcs_vfpcc void @test_fadd_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fadd_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB1_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: .LBB1_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vadd.f16 q0, q0, r3
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT: bne .LBB1_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes.
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: splat + vector (splat on the left-hand side).
%i3 = fadd fast <8 x half> %broadcast.splat11, %wide.load
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fmul: vector loop over <8 x half> that stores (load from %A) * splat(%B)
; to %C, 8 elements per iteration. The splat is the SECOND fmul operand.
; The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect `vmov.f16 r3, s0` before the loop and the
; vector-by-scalar form `vmul.f16 q0, q0, r3` inside it.
define arm_aapcs_vfpcc void @test_fmul(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fmul:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB2_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: .LBB2_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r3
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT: bne .LBB2_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes.
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: vector * splat (splat on the right-hand side).
%i3 = fmul fast <8 x half> %wide.load, %broadcast.splat11
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fmul_r: same loop as @test_fmul but with the fmul operands reversed,
; i.e. splat(%B) is the FIRST operand and the loaded vector the second.
; The CHECK lines expect the same code as the non-reversed case:
; `vmov.f16 r3, s0` before the loop and `vmul.f16 q0, q0, r3` inside it.
define arm_aapcs_vfpcc void @test_fmul_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fmul_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB3_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: .LBB3_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r3
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT: bne .LBB3_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes.
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: splat * vector (splat on the left-hand side).
%i3 = fmul fast <8 x half> %broadcast.splat11, %wide.load
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fsub: vector loop over <8 x half> that stores (load from %A) - splat(%B)
; to %C, 8 elements per iteration. The splat is the subtrahend (second
; operand). The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect `vmov.f16 r3, s0` before the loop and the
; vector-by-scalar form `vsub.f16 q0, q0, r3` inside it.
define arm_aapcs_vfpcc void @test_fsub(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fsub:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB4_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: .LBB4_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vsub.f16 q0, q0, r3
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #16
|
|
; CHECK-NEXT: bne .LBB4_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes.
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: vector - splat (splat is the subtrahend).
%i3 = fsub fast <8 x half> %wide.load, %broadcast.splat11
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fsub_r: same loop as @test_fsub but with the fsub operands reversed:
; splat(%B) is the minuend (first operand) and the loaded vector is subtracted
; from it.
; Unlike @test_fsub, the CHECK lines here expect the splat to be materialised
; in a vector register before the loop (`vdup.16 q0, r3`) and the loop to use
; the vector-vector form `vsub.f16 q1, q0, q1`.
define arm_aapcs_vfpcc void @test_fsub_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
|
|
; CHECK-LABEL: test_fsub_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r2, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB5_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r3, s0
|
|
; CHECK-NEXT: vdup.16 q0, r3
|
|
; CHECK-NEXT: .LBB5_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT: subs r2, #8
|
|
; CHECK-NEXT: vsub.f16 q1, q0, q1
|
|
; CHECK-NEXT: vstrb.8 q1, [r1], #16
|
|
; CHECK-NEXT: bne .LBB5_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp18 = icmp sgt i32 %n, 0
|
|
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %B to all eight lanes.
%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
|
|
%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Operation under test: splat - vector (splat is the minuend).
%i3 = fsub fast <8 x half> %broadcast.splat11, %wide.load
|
|
%i4 = getelementptr inbounds half, ptr %C, i32 %index
|
|
store <8 x half> %i3, ptr %i4, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i6 = icmp eq i32 %index.next, %n
|
|
br i1 %i6, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
|
|
; test_fmas: vector loop over <8 x half> that stores
; (load from %B) * (load from %A) + splat(%C) to %D, 8 elements per iteration.
; The splat is the addend and appears as the SECOND fadd operand.
; The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect `vmov.f16 r12, s0` before the loop and a single
; `vfmas.f16 q1, q0, r12` (scalar addend in a GPR) inside it.
define arm_aapcs_vfpcc void @test_fmas(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmas:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB6_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB6_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfmas.f16 q1, q0, r12
|
|
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT: bne .LBB6_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
%i3 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load12 = load <8 x half>, ptr %i3, align 4
|
|
; Pattern under test: (vector * vector) + splat.
%i5 = fmul fast <8 x half> %wide.load12, %wide.load
|
|
%i6 = fadd fast <8 x half> %i5, %broadcast.splat14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fmas_r: same loop as @test_fmas but with the fadd operands reversed,
; i.e. splat(%C) + (load from %B) * (load from %A).
; The CHECK lines expect the same code as the non-reversed case:
; `vmov.f16 r12, s0` before the loop and `vfmas.f16 q1, q0, r12` inside it.
define arm_aapcs_vfpcc void @test_fmas_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmas_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB7_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB7_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfmas.f16 q1, q0, r12
|
|
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT: bne .LBB7_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
%i3 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load12 = load <8 x half>, ptr %i3, align 4
|
|
; Pattern under test: splat + (vector * vector).
%i5 = fmul fast <8 x half> %wide.load12, %wide.load
|
|
%i6 = fadd fast <8 x half> %broadcast.splat14, %i5
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fma: vector loop over <8 x half> that stores
; (load from %A) * splat(%C) + (load from %B) to %D, 8 elements per iteration.
; Here the splat is a MULTIPLICAND (second fmul operand), not the addend.
; The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect `vmov.f16 r12, s0` before the loop and a single
; `vfma.f16 q1, q0, r12` (scalar multiplicand in a GPR) inside it.
define arm_aapcs_vfpcc void @test_fma(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fma:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB8_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB8_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfma.f16 q1, q0, r12
|
|
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT: bne .LBB8_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Pattern under test: (vector * splat) + vector.
%i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
|
|
%i4 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load14 = load <8 x half>, ptr %i4, align 4
|
|
%i6 = fadd fast <8 x half> %i3, %wide.load14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fma_r: same loop as @test_fma but with the fmul operands reversed,
; i.e. splat(%C) * (load from %A) + (load from %B).
; The CHECK lines expect the same code as the non-reversed case:
; `vmov.f16 r12, s0` before the loop and `vfma.f16 q1, q0, r12` inside it.
define arm_aapcs_vfpcc void @test_fma_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fma_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB9_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB9_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfma.f16 q1, q0, r12
|
|
; CHECK-NEXT: vstrb.8 q1, [r2], #16
|
|
; CHECK-NEXT: bne .LBB9_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Pattern under test: (splat * vector) + vector.
%i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
|
|
%i4 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load14 = load <8 x half>, ptr %i4, align 4
|
|
%i6 = fadd fast <8 x half> %i3, %wide.load14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
|
|
; test_fmss: vector loop over <8 x half> that stores
; (load from %B) * (load from %A) - splat(%C) to %D, 8 elements per iteration.
; The splat is the subtrahend of the fsub.
; The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect the splat to be materialised and negated once before
; the loop (`vdup.16 q0, r12` then `vneg.f16 q0, q0`); each iteration copies it
; (`vmov q3, q0`) and accumulates with the vector-vector
; `vfma.f16 q3, q2, q1`. No scalar-register FMA form is expected here.
define arm_aapcs_vfpcc void @test_fmss(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmss:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB10_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: vdup.16 q0, r12
|
|
; CHECK-NEXT: vneg.f16 q0, q0
|
|
; CHECK-NEXT: .LBB10_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
|
|
; CHECK-NEXT: vmov q3, q0
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfma.f16 q3, q2, q1
|
|
; CHECK-NEXT: vstrb.8 q3, [r2], #16
|
|
; CHECK-NEXT: bne .LBB10_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
%i3 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load12 = load <8 x half>, ptr %i3, align 4
|
|
; Pattern under test: (vector * vector) - splat.
%i5 = fmul fast <8 x half> %wide.load12, %wide.load
|
|
%i6 = fsub fast <8 x half> %i5, %broadcast.splat14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fmss_r: same loop as @test_fmss but with the fsub operands reversed,
; i.e. splat(%C) - (load from %B) * (load from %A).
; The CHECK lines expect the splat to be materialised once before the loop
; (`vdup.16 q0, r12`, with no vneg); each iteration copies it (`vmov q3, q0`)
; and applies the vector-vector `vfms.f16 q3, q2, q1`.
define arm_aapcs_vfpcc void @test_fmss_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fmss_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB11_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: vdup.16 q0, r12
|
|
; CHECK-NEXT: .LBB11_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
|
|
; CHECK-NEXT: vmov q3, q0
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vfms.f16 q3, q2, q1
|
|
; CHECK-NEXT: vstrb.8 q3, [r2], #16
|
|
; CHECK-NEXT: bne .LBB11_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
%i3 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load12 = load <8 x half>, ptr %i3, align 4
|
|
; Pattern under test: splat - (vector * vector).
%i5 = fmul fast <8 x half> %wide.load12, %wide.load
|
|
%i6 = fsub fast <8 x half> %broadcast.splat14, %i5
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fms: vector loop over <8 x half> that stores
; (load from %A) * splat(%C) - (load from %B) to %D, 8 elements per iteration.
; The splat is a multiplicand (second fmul operand); the %B vector is the
; subtrahend. The entry block asserts via llvm.assume that (%n & 7) == 0.
; The CHECK lines expect `vmov.f16 r12, s0` before the loop; inside it the %B
; vector is negated (`vneg.f16 q0, q0`) and then accumulated with the
; scalar-register form `vfma.f16 q0, q1, r12`.
define arm_aapcs_vfpcc void @test_fms(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fms:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB12_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB12_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vneg.f16 q0, q0
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT: bne .LBB12_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Pattern under test: (vector * splat) - vector.
%i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
|
|
%i4 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load14 = load <8 x half>, ptr %i4, align 4
|
|
%i6 = fsub fast <8 x half> %i3, %wide.load14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; test_fms_r: same loop as @test_fms but with the fmul operands reversed,
; i.e. splat(%C) * (load from %A) - (load from %B).
; The CHECK lines expect the same code as the non-reversed case: `vmov.f16
; r12, s0` before the loop, then `vneg.f16 q0, q0` and `vfma.f16 q0, q1, r12`
; inside it.
define arm_aapcs_vfpcc void @test_fms_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
|
|
; CHECK-LABEL: test_fms_r:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: cmp r3, #1
|
|
; CHECK-NEXT: it lt
|
|
; CHECK-NEXT: bxlt lr
|
|
; CHECK-NEXT: .LBB13_1: @ %vector.ph
|
|
; CHECK-NEXT: vmov.f16 r12, s0
|
|
; CHECK-NEXT: .LBB13_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
|
; CHECK-NEXT: subs r3, #8
|
|
; CHECK-NEXT: vneg.f16 q0, q0
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #16
|
|
; CHECK-NEXT: bne .LBB13_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
%i = and i32 %n, 7
|
|
%cmp = icmp eq i32 %i, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp110 = icmp sgt i32 %n, 0
|
|
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Broadcast %C to all eight lanes.
%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
|
|
%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
|
%i1 = getelementptr inbounds half, ptr %A, i32 %index
|
|
%wide.load = load <8 x half>, ptr %i1, align 4
|
|
; Pattern under test: (splat * vector) - vector.
%i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
|
|
%i4 = getelementptr inbounds half, ptr %B, i32 %index
|
|
%wide.load14 = load <8 x half>, ptr %i4, align 4
|
|
%i6 = fsub fast <8 x half> %i3, %wide.load14
|
|
%i7 = getelementptr inbounds half, ptr %D, i32 %index
|
|
store <8 x half> %i6, ptr %i7, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i9 = icmp eq i32 %index.next, %n
|
|
br i1 %i9, label %for.cond.cleanup, label %vector.body
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
|
|
|
|
; test_nested: two-level loop nest. The outer loop (%for.body.us) runs
; %numRows times; on each iteration it loads one half from %pOutT1 (advancing
; that pointer by one element) and broadcasts it to <8 x half>. The inner loop
; (%vector.body) walks %numCols elements 8 at a time and updates %pInT1 in
; place: pInT1[j] = pInT1[j] - pPRT_in[j] * splat. After the inner loop both
; the %pInT1 and %pPRT_in row pointers advance by %numCols elements.
; The entry block asserts via llvm.assume that %numRows > 0, %numCols > 0,
; (%numCols & 7) == 0 and %l < %numCols. %pPRT_pDst is marked readnone and is
; not referenced in the body.
; The CHECK lines expect the splat to be built in the outer loop
; (`vdup.16 q0, r4`), the inner loop to use the vector-vector
; `vfms.f16 q2, q1, q0`, and the outer loop to close with `le lr, .LBB14_1`.
define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias nocapture readonly %pOutT1, ptr noalias nocapture readonly %pPRT_in, ptr noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr {
|
|
; CHECK-LABEL: test_nested:
|
|
; CHECK: @ %bb.0: @ %for.body.us.preheader
|
|
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
|
; CHECK-NEXT: push {r4, r5, r6, lr}
|
|
; CHECK-NEXT: ldrd lr, r12, [sp, #16]
|
|
; CHECK-NEXT: lsl.w r3, r12, #1
|
|
; CHECK-NEXT: .LBB14_1: @ %for.body.us
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
|
|
; CHECK-NEXT: ldrh r4, [r1]
|
|
; CHECK-NEXT: mov r5, r2
|
|
; CHECK-NEXT: mov r6, r12
|
|
; CHECK-NEXT: vdup.16 q0, r4
|
|
; CHECK-NEXT: mov r4, r0
|
|
; CHECK-NEXT: .LBB14_2: @ %vector.body
|
|
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5], #16
|
|
; CHECK-NEXT: vldrw.u32 q2, [r4]
|
|
; CHECK-NEXT: subs r6, #8
|
|
; CHECK-NEXT: vfms.f16 q2, q1, q0
|
|
; CHECK-NEXT: vstrb.8 q2, [r4], #16
|
|
; CHECK-NEXT: bne .LBB14_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
|
|
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
|
|
; CHECK-NEXT: add r0, r3
|
|
; CHECK-NEXT: add r2, r3
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: le lr, .LBB14_1
|
|
; CHECK-NEXT: @ %bb.4: @ %for.end14
|
|
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
|
for.body.us.preheader:
|
|
; Preconditions handed to the optimizer through llvm.assume.
%cmp = icmp sgt i32 %numRows, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp1 = icmp sgt i32 %numCols, 0
|
|
tail call void @llvm.assume(i1 %cmp1)
|
|
%rem = and i32 %numCols, 7
|
|
%cmp2 = icmp eq i32 %rem, 0
|
|
tail call void @llvm.assume(i1 %cmp2)
|
|
%cmp3 = icmp slt i32 %l, %numCols
|
|
tail call void @llvm.assume(i1 %cmp3)
|
|
br label %for.body.us
|
|
|
|
for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
|
|
%pInT1.addr.038.us = phi ptr [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
|
|
%i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
|
|
%pOutT1.addr.036.us = phi ptr [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
|
|
%pPRT_in.addr.035.us = phi ptr [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
|
|
%scevgep = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %numCols
|
|
; Per-row scalar: load one half and broadcast it to all eight lanes.
%i = load half, ptr %pOutT1.addr.036.us, align 4
|
|
%broadcast.splatinsert47 = insertelement <8 x half> undef, half %i, i32 0
|
|
%broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %for.body.us
|
|
%index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
|
|
%next.gep = getelementptr half, ptr %pInT1.addr.038.us, i32 %index
|
|
%next.gep45 = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %index
|
|
%wide.load = load <8 x half>, ptr %next.gep, align 4
|
|
%wide.load46 = load <8 x half>, ptr %next.gep45, align 4
|
|
; Pattern under test: vector - (vector * splat), stored back in place.
%i3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
|
|
%i4 = fsub fast <8 x half> %wide.load, %i3
|
|
store <8 x half> %i4, ptr %next.gep, align 4
|
|
%index.next = add i32 %index, 8
|
|
%i5 = icmp eq i32 %index.next, %numCols
|
|
br i1 %i5, label %for.cond6.for.end_crit_edge.us, label %vector.body
|
|
|
|
for.cond6.for.end_crit_edge.us: ; preds = %vector.body
|
|
%incdec.ptr.us = getelementptr inbounds half, ptr %pOutT1.addr.036.us, i32 1
|
|
%scevgep40 = getelementptr half, ptr %pInT1.addr.038.us, i32 %numCols
|
|
%inc13.us = add nuw nsw i32 %i.037.us, 1
|
|
%exitcond41 = icmp eq i32 %inc13.us, %numRows
|
|
br i1 %exitcond41, label %for.end14, label %for.body.us
|
|
|
|
for.end14: ; preds = %for.cond6.for.end_crit_edge.us
|
|
ret void
|
|
}
|
|
|
|
; Named struct type: one i16 followed by two pointers.
; NOTE(review): presumably the layout of the %S argument of
; @arm_fir_f32_1_4_mve (whose CHECK lines read offsets 0, 4 and 8 from r0);
; the field meanings are not visible in this part of the file - confirm.
%struct.arm_fir_instance_f32 = type { i16, ptr, ptr }
|
|
; Test: 4-tap FIR over half-precision data, written with <8 x half> vectors.
; Despite the "f32" in the function name, every data load/store in the body
; uses half / <8 x half>.
;
; Arguments:
;   %S         - instance struct; field 0 (i16) is loaded as numTaps, field 1
;                as the state pointer, field 2 as the coefficient pointer.
;   %pSrc      - input samples.
;   %pDst      - output samples.
;   %blockSize - sample count; blockSize >> 2 full iterations run, followed
;                by one tail predicated on vctp16(blockSize & 3).
;
; The filter part only runs when numTaps - 1 is unsigned-less-than 4.
; Afterwards numTaps >> 2 vectors are copied from state + blockSize to the
; start of the state buffer, plus a predicated remainder for numTaps & 3.
; The assertions inside the function are autogenerated (see the NOTE at the
; top of the file); they expect low-overhead loops (wls/le) and vfma.f16
; with the coefficient held in a general-purpose register.
define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_fir_f32_1_4_mve:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #16
|
|
; CHECK-NEXT: sub sp, #16
|
|
; CHECK-NEXT: ldrh.w r9, [r0]
|
|
; CHECK-NEXT: ldr.w r10, [r0, #4]
|
|
; CHECK-NEXT: sub.w r6, r9, #1
|
|
; CHECK-NEXT: cmp r6, #3
|
|
; CHECK-NEXT: bhi .LBB15_6
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: ldr r7, [r0, #8]
|
|
; CHECK-NEXT: add.w r4, r10, r6, lsl #1
|
|
; CHECK-NEXT: lsrs r5, r3, #2
|
|
; CHECK-NEXT: ldrh.w r8, [r7, #6]
|
|
; CHECK-NEXT: ldrh.w r12, [r7, #4]
|
|
; CHECK-NEXT: ldrh r6, [r7, #2]
|
|
; CHECK-NEXT: ldrh r7, [r7]
|
|
; CHECK-NEXT: wls lr, r5, .LBB15_5
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: bic r5, r3, #3
|
|
; CHECK-NEXT: add.w r9, r10, #2
|
|
; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: add.w r5, r2, r5, lsl #1
|
|
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
|
|
; CHECK-NEXT: .LBB15_3: @ %while.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
|
|
; CHECK-NEXT: sub.w r11, r9, #2
|
|
; CHECK-NEXT: add.w r5, r9, #2
|
|
; CHECK-NEXT: vstrb.8 q0, [r4], #8
|
|
; CHECK-NEXT: vldrw.u32 q0, [r11]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9]
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r7
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
|
|
; CHECK-NEXT: add.w r9, r9, #8
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #8
|
|
; CHECK-NEXT: le lr, .LBB15_3
|
|
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
|
|
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: .LBB15_5: @ %while.end
|
|
; CHECK-NEXT: and r5, r3, #3
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
|
; CHECK-NEXT: vctp.16 r5
|
|
; CHECK-NEXT: add.w r1, r10, #2
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q0, [r4]
|
|
; CHECK-NEXT: vldrw.u32 q0, [r10]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
|
; CHECK-NEXT: add.w r1, r10, #6
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r7
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q0, [r2]
|
|
; CHECK-NEXT: ldr.w r10, [r0, #4]
|
|
; CHECK-NEXT: .LBB15_6: @ %if.end
|
|
; CHECK-NEXT: add.w r0, r10, r3, lsl #1
|
|
; CHECK-NEXT: lsr.w r1, r9, #2
|
|
; CHECK-NEXT: wls lr, r1, .LBB15_10
|
|
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
|
|
; CHECK-NEXT: bic r2, r9, #3
|
|
; CHECK-NEXT: adds r1, r2, r3
|
|
; CHECK-NEXT: mov r3, r10
|
|
; CHECK-NEXT: add.w r1, r10, r1, lsl #1
|
|
; CHECK-NEXT: .LBB15_8: @ %while.body51
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
; CHECK-NEXT: vstrb.8 q0, [r3], #8
|
|
; CHECK-NEXT: le lr, .LBB15_8
|
|
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
|
|
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
|
|
; CHECK-NEXT: mov r0, r1
|
|
; CHECK-NEXT: .LBB15_10: @ %while.end55
|
|
; CHECK-NEXT: ands r1, r9, #3
|
|
; CHECK-NEXT: beq .LBB15_12
|
|
; CHECK-NEXT: @ %bb.11: @ %if.then59
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
|
; CHECK-NEXT: vctp.16 r1
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q0, [r10]
|
|
; CHECK-NEXT: .LBB15_12: @ %if.end61
|
|
; CHECK-NEXT: add sp, #16
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
; Load the state pointer (%i), coefficient pointer (%i1) and numTaps (%conv)
; from the instance, then take the filter path only if numTaps - 1 <u 4.
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
|
|
%i1 = load ptr, ptr %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
|
|
%i2 = load i16, ptr %numTaps3, align 4
|
|
%conv = zext i16 %i2 to i32
|
|
%sub = add nsw i32 %conv, -1
|
|
%cmp = icmp ult i32 %sub, 4
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
; Load the four scalar coefficients and splat each across an <8 x half>
; vector; skip the main loop when blockSize >> 2 is zero.
if.then: ; preds = %entry
|
|
%arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
|
|
%i3 = load half, ptr %i1, align 4
|
|
%incdec.ptr6 = getelementptr inbounds half, ptr %i1, i32 2
|
|
%i4 = load half, ptr %incdec.ptr, align 4
|
|
%incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 3
|
|
%i5 = load half, ptr %incdec.ptr6, align 4
|
|
%i6 = load half, ptr %incdec.ptr7, align 4
|
|
%shr = lshr i32 %blockSize, 2
|
|
%cmp9146 = icmp eq i32 %shr, 0
|
|
%.pre161 = insertelement <8 x half> undef, half %i3, i32 0
|
|
%.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre163 = insertelement <8 x half> undef, half %i4, i32 0
|
|
%.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre165 = insertelement <8 x half> undef, half %i5, i32 0
|
|
%.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre167 = insertelement <8 x half> undef, half %i6, i32 0
|
|
%.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br i1 %cmp9146, label %while.end, label %while.body.lr.ph
|
|
|
|
while.body.lr.ph: ; preds = %if.then
|
|
%i7 = and i32 %blockSize, -4
|
|
%scevgep158 = getelementptr half, ptr %pDst, i32 %i7
|
|
br label %while.body
|
|
|
|
; Main loop: copy one input vector into the state buffer, then combine four
; overlapping sample vectors (offsets 0..3) with the splatted coefficients
; and store the sum. All four pointers advance by 4 half elements.
while.body: ; preds = %while.body, %while.body.lr.ph
|
|
%pStateCur.0151 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
|
|
%pSamples.0150 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
|
|
%pOutput.0149 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
|
|
%pTempSrc.0148 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
|
|
%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
|
|
%i9 = load <8 x half>, ptr %pTempSrc.0148, align 4
|
|
store <8 x half> %i9, ptr %pStateCur.0151, align 4
|
|
%add.ptr = getelementptr inbounds half, ptr %pStateCur.0151, i32 4
|
|
%add.ptr11 = getelementptr inbounds half, ptr %pTempSrc.0148, i32 4
|
|
%i12 = load <8 x half>, ptr %pSamples.0150, align 4
|
|
%i13 = fmul fast <8 x half> %i12, %.pre162
|
|
%arrayidx12 = getelementptr inbounds half, ptr %pSamples.0150, i32 1
|
|
%i15 = load <8 x half>, ptr %arrayidx12, align 4
|
|
%mul = fmul fast <8 x half> %i15, %.pre164
|
|
%add = fadd fast <8 x half> %mul, %i13
|
|
%arrayidx13 = getelementptr inbounds half, ptr %pSamples.0150, i32 2
|
|
%i17 = load <8 x half>, ptr %arrayidx13, align 4
|
|
%mul16 = fmul fast <8 x half> %i17, %.pre166
|
|
%add17 = fadd fast <8 x half> %add, %mul16
|
|
%arrayidx18 = getelementptr inbounds half, ptr %pSamples.0150, i32 3
|
|
%i19 = load <8 x half>, ptr %arrayidx18, align 4
|
|
%mul21 = fmul fast <8 x half> %i19, %.pre168
|
|
%add22 = fadd fast <8 x half> %add17, %mul21
|
|
store <8 x half> %add22, ptr %pOutput.0149, align 4
|
|
%add.ptr23 = getelementptr inbounds half, ptr %pOutput.0149, i32 4
|
|
%add.ptr24 = getelementptr inbounds half, ptr %pSamples.0150, i32 4
|
|
%dec = add nsw i32 %blkCnt.0147, -1
|
|
%cmp9 = icmp eq i32 %dec, 0
|
|
br i1 %cmp9, label %while.end.loopexit, label %while.body
|
|
|
|
while.end.loopexit: ; preds = %while.body
|
|
%scevgep157 = getelementptr half, ptr %pSrc, i32 %i7
|
|
%scevgep159 = getelementptr half, ptr %i, i32 %i7
|
|
br label %while.end
|
|
|
|
; Tail: the same copy and 4-tap computation once more, with both stores
; masked by vctp16(blockSize & 3). The state pointer is then reloaded
; from %S (%.pre) for use in %if.end.
while.end: ; preds = %while.end.loopexit, %if.then
|
|
%pTempSrc.0.lcssa = phi ptr [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
|
|
%pOutput.0.lcssa = phi ptr [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
|
|
%pSamples.0.lcssa = phi ptr [ %scevgep159, %while.end.loopexit ], [ %i, %if.then ]
|
|
%pStateCur.0.lcssa = phi ptr [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
|
|
%and = and i32 %blockSize, 3
|
|
%i21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and)
|
|
%i23 = load <8 x half>, ptr %pTempSrc.0.lcssa, align 4
|
|
tail call void @llvm.masked.store.v8f16.p0(<8 x half> %i23, ptr %pStateCur.0.lcssa, i32 4, <8 x i1> %i21)
|
|
%i26 = load <8 x half>, ptr %pSamples.0.lcssa, align 4
|
|
%i27 = fmul fast <8 x half> %i26, %.pre162
|
|
%arrayidx29 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 1
|
|
%i29 = load <8 x half>, ptr %arrayidx29, align 4
|
|
%mul32 = fmul fast <8 x half> %i29, %.pre164
|
|
%add33 = fadd fast <8 x half> %mul32, %i27
|
|
%arrayidx34 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 2
|
|
%i31 = load <8 x half>, ptr %arrayidx34, align 4
|
|
%mul37 = fmul fast <8 x half> %i31, %.pre166
|
|
%add38 = fadd fast <8 x half> %add33, %mul37
|
|
%arrayidx39 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 3
|
|
%i33 = load <8 x half>, ptr %arrayidx39, align 4
|
|
%mul42 = fmul fast <8 x half> %i33, %.pre168
|
|
%add43 = fadd fast <8 x half> %add38, %mul42
|
|
tail call void @llvm.masked.store.v8f16.p0(<8 x half> %add43, ptr %pOutput.0.lcssa, i32 4, <8 x i1> %i21)
|
|
%.pre = load ptr, ptr %pState1, align 4
|
|
br label %if.end
|
|
|
|
; State update: the source starts at state + blockSize, the destination at
; the start of the state buffer; the copy loop runs numTaps >> 2 times.
if.end: ; preds = %while.end, %entry
|
|
%i35 = phi ptr [ %.pre, %while.end ], [ %i, %entry ]
|
|
%arrayidx45 = getelementptr inbounds half, ptr %i35, i32 %blockSize
|
|
%shr47 = lshr i32 %conv, 2
|
|
%cmp49141 = icmp eq i32 %shr47, 0
|
|
br i1 %cmp49141, label %while.end55, label %while.body51.preheader
|
|
|
|
while.body51.preheader: ; preds = %if.end
|
|
%i36 = and i32 %conv, 65532
|
|
%i37 = add i32 %i36, %blockSize
|
|
%scevgep = getelementptr half, ptr %i35, i32 %i37
|
|
br label %while.body51
|
|
|
|
; Copy loop: one <8 x half> load/store per iteration, both pointers
; advancing by 4 half elements.
while.body51: ; preds = %while.body51, %while.body51.preheader
|
|
%pTempSrc.1144 = phi ptr [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
|
|
%pTempDest.0143 = phi ptr [ %add.ptr53, %while.body51 ], [ %i35, %while.body51.preheader ]
|
|
%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
|
|
%i39 = load <8 x half>, ptr %pTempSrc.1144, align 4
|
|
store <8 x half> %i39, ptr %pTempDest.0143, align 4
|
|
%add.ptr52 = getelementptr inbounds half, ptr %pTempSrc.1144, i32 4
|
|
%add.ptr53 = getelementptr inbounds half, ptr %pTempDest.0143, i32 4
|
|
%dec54 = add nsw i32 %blkCnt.1142, -1
|
|
%cmp49 = icmp eq i32 %dec54, 0
|
|
br i1 %cmp49, label %while.end55.loopexit, label %while.body51
|
|
|
|
while.end55.loopexit: ; preds = %while.body51
|
|
%scevgep156 = getelementptr half, ptr %i35, i32 %i36
|
|
br label %while.end55
|
|
|
|
while.end55: ; preds = %while.end55.loopexit, %if.end
|
|
%pTempDest.0.lcssa = phi ptr [ %i35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
|
|
%pTempSrc.1.lcssa = phi ptr [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
|
|
%and56 = and i32 %conv, 3
|
|
%cmp57 = icmp eq i32 %and56, 0
|
|
br i1 %cmp57, label %if.end61, label %if.then59
|
|
|
|
; Remainder of the state copy, masked by vctp16(numTaps & 3).
if.then59: ; preds = %while.end55
|
|
%i41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56)
|
|
%i43 = load <8 x half>, ptr %pTempSrc.1.lcssa, align 4
|
|
tail call void @llvm.masked.store.v8f16.p0(<8 x half> %i43, ptr %pTempDest.0.lcssa, i32 4, <8 x i1> %i41)
|
|
br label %if.end61
|
|
|
|
if.end61: ; preds = %if.then59, %while.end55
|
|
ret void
|
|
}
|
|
|
|
|
|
; Test: general FIR over half-precision data using <8 x half> vectors.
;
; Arguments:
;   %S         - %struct.arm_fir_instance_f32; numTaps (field 0, i16), the
;                state pointer (field 1) and the coefficient pointer
;                (field 2) are loaded in the entry block.
;   %pSrc      - input samples.
;   %pDst      - output samples.
;   %blockSize - sample count; nothing is done unless blockSize > 7
;                (unsigned), and the outer loop runs blockSize >> 2 times.
;
; Each outer iteration copies one input vector into the state buffer,
; accumulates the first 8 taps (one fmul plus seven llvm.fma calls), adds
; further taps 8 at a time in %for.body when numTaps > 15, adds the
; (numTaps - 8) & 7 leftover taps one at a time in %while.body76, and
; finally stores the accumulator to the output.
; The autogenerated assertions inside the function expect nested
; low-overhead loops (dls/wls with le) and vfma.f16 with a scalar register
; operand.
define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: fir:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #24
|
|
; CHECK-NEXT: sub sp, #24
|
|
; CHECK-NEXT: cmp r3, #8
|
|
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
|
|
; CHECK-NEXT: blo.w .LBB16_12
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: lsrs.w r12, r3, #2
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: ldrh r4, [r0]
|
|
; CHECK-NEXT: movs r1, #1
|
|
; CHECK-NEXT: ldrd r5, r3, [r0, #4]
|
|
; CHECK-NEXT: sub.w r0, r4, #8
|
|
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
|
|
; CHECK-NEXT: and r0, r0, #7
|
|
; CHECK-NEXT: asrs r6, r7, #3
|
|
; CHECK-NEXT: cmp r6, #1
|
|
; CHECK-NEXT: it gt
|
|
; CHECK-NEXT: asrgt r1, r7, #3
|
|
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
|
|
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: subs r1, r7, #2
|
|
; CHECK-NEXT: rsbs r7, r4, #0
|
|
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
|
|
; CHECK-NEXT: add.w r7, r3, #16
|
|
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
|
|
; CHECK-NEXT: b .LBB16_6
|
|
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
|
|
; CHECK-NEXT: b .LBB16_5
|
|
; CHECK-NEXT: .LBB16_4: @ %for.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
; CHECK-NEXT: wls lr, r0, .LBB16_5
|
|
; CHECK-NEXT: b .LBB16_10
|
|
; CHECK-NEXT: .LBB16_5: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: subs.w r12, r12, #1
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #8
|
|
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
|
|
; CHECK-NEXT: add.w r5, r0, #8
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
; CHECK-NEXT: .LBB16_6: @ %while.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
|
|
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
|
|
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
|
; CHECK-NEXT: ldrh.w lr, [r3, #14]
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
; CHECK-NEXT: ldrh.w r8, [r3, #12]
|
|
; CHECK-NEXT: ldrh r7, [r3, #10]
|
|
; CHECK-NEXT: ldrh r4, [r3, #8]
|
|
; CHECK-NEXT: ldrh r6, [r3, #6]
|
|
; CHECK-NEXT: ldrh.w r9, [r3, #4]
|
|
; CHECK-NEXT: ldrh.w r11, [r3, #2]
|
|
; CHECK-NEXT: ldrh.w r10, [r3]
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #8
|
|
; CHECK-NEXT: vldrw.u32 q0, [r5]
|
|
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r0, r5, #2
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r10
|
|
; CHECK-NEXT: adds r0, r5, #6
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r11
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r9
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: add.w r0, r5, #10
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: add.w r0, r5, #14
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r7
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
|
|
; CHECK-NEXT: adds r5, #16
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
|
|
; CHECK-NEXT: vfma.f16 q0, q1, lr
|
|
; CHECK-NEXT: cmp r0, #16
|
|
; CHECK-NEXT: blo .LBB16_9
|
|
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: dls lr, r0
|
|
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: .LBB16_8: @ %for.body
|
|
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrh r0, [r6], #16
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5]
|
|
; CHECK-NEXT: adds r4, r5, #2
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
|
; CHECK-NEXT: ldrh r0, [r6, #-14]
|
|
; CHECK-NEXT: adds r4, r5, #6
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #-12]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
|
; CHECK-NEXT: ldrh r0, [r6, #-10]
|
|
; CHECK-NEXT: add.w r4, r5, #10
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #-8]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
|
; CHECK-NEXT: ldrh r0, [r6, #-6]
|
|
; CHECK-NEXT: ldrh r4, [r6, #-2]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #-4]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: add.w r0, r5, #14
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: adds r5, #16
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
|
; CHECK-NEXT: le lr, .LBB16_8
|
|
; CHECK-NEXT: b .LBB16_4
|
|
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: b .LBB16_4
|
|
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
|
|
; CHECK-NEXT: mov r0, r5
|
|
; CHECK-NEXT: .LBB16_11: @ %while.body76
|
|
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrh r4, [r6], #2
|
|
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
|
; CHECK-NEXT: le lr, .LBB16_11
|
|
; CHECK-NEXT: b .LBB16_3
|
|
; CHECK-NEXT: .LBB16_12: @ %if.end
|
|
; CHECK-NEXT: add sp, #24
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
; Load the state pointer (%i), coefficient pointer (%i1) and numTaps (%conv);
; return immediately unless blockSize > 7 (unsigned).
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
|
|
%i1 = load ptr, ptr %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
|
|
%i2 = load i16, ptr %numTaps3, align 4
|
|
%conv = zext i16 %i2 to i32
|
|
%cmp = icmp ugt i32 %blockSize, 7
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
if.then: ; preds = %entry
|
|
%shr = lshr i32 %blockSize, 2
|
|
%cmp5217 = icmp eq i32 %shr, 0
|
|
br i1 %cmp5217, label %if.end, label %while.body.lr.ph
|
|
|
|
; Loop-invariant setup: addresses of coefficients 1..8, the trip count of
; %for.body (%smax = max((numTaps - 8) sdiv 8, 1)), the leftover tap count
; (%and = (numTaps - 8) & 7) and -numTaps for rewinding the sample pointer.
while.body.lr.ph: ; preds = %if.then
|
|
%sub = add nsw i32 %conv, -1
|
|
%arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
|
|
%incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 2
|
|
%incdec.ptr8 = getelementptr inbounds half, ptr %i1, i32 3
|
|
%incdec.ptr9 = getelementptr inbounds half, ptr %i1, i32 4
|
|
%incdec.ptr10 = getelementptr inbounds half, ptr %i1, i32 5
|
|
%incdec.ptr11 = getelementptr inbounds half, ptr %i1, i32 6
|
|
%incdec.ptr12 = getelementptr inbounds half, ptr %i1, i32 7
|
|
%sub37 = add nsw i32 %conv, -8
|
|
%div = sdiv i32 %sub37, 8
|
|
%pCoeffsCur.0199 = getelementptr inbounds half, ptr %i1, i32 8
|
|
%cmp38201 = icmp ugt i16 %i2, 15
|
|
%and = and i32 %sub37, 7
|
|
%cmp74210 = icmp eq i32 %and, 0
|
|
%idx.neg = sub nsw i32 0, %conv
|
|
%i3 = icmp sgt i32 %div, 1
|
|
%smax = select i1 %i3, i32 %div, i32 1
|
|
br label %while.body
|
|
|
|
; Outer loop: copy one input vector into the state buffer, then accumulate
; taps 0..7 from sample vectors at offsets 0..7, each multiplied by a
; splatted scalar coefficient.
while.body: ; preds = %while.end, %while.body.lr.ph
|
|
%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
|
|
%pStateCur.0221 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
|
|
%pSamples.0220 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
|
|
%pTempSrc.0219 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
|
|
%pOutput.0218 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
|
|
%i4 = load half, ptr %i1, align 4
|
|
%i5 = load half, ptr %incdec.ptr, align 4
|
|
%i6 = load half, ptr %incdec.ptr7, align 4
|
|
%i7 = load half, ptr %incdec.ptr8, align 4
|
|
%i8 = load half, ptr %incdec.ptr9, align 4
|
|
%i9 = load half, ptr %incdec.ptr10, align 4
|
|
%i10 = load half, ptr %incdec.ptr11, align 4
|
|
%i11 = load half, ptr %incdec.ptr12, align 4
|
|
%i13 = load <8 x half>, ptr %pTempSrc.0219, align 4
|
|
store <8 x half> %i13, ptr %pStateCur.0221, align 4
|
|
%add.ptr = getelementptr inbounds half, ptr %pStateCur.0221, i32 4
|
|
%add.ptr14 = getelementptr inbounds half, ptr %pTempSrc.0219, i32 4
|
|
%i16 = load <8 x half>, ptr %pSamples.0220, align 4
|
|
%.splatinsert = insertelement <8 x half> undef, half %i4, i32 0
|
|
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i17 = fmul fast <8 x half> %i16, %.splat
|
|
%arrayidx15 = getelementptr inbounds half, ptr %pSamples.0220, i32 1
|
|
%i19 = load <8 x half>, ptr %arrayidx15, align 4
|
|
%.splatinsert16 = insertelement <8 x half> undef, half %i5, i32 0
|
|
%.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i19, <8 x half> %.splat17, <8 x half> %i17)
|
|
%arrayidx18 = getelementptr inbounds half, ptr %pSamples.0220, i32 2
|
|
%i22 = load <8 x half>, ptr %arrayidx18, align 4
|
|
%.splatinsert19 = insertelement <8 x half> undef, half %i6, i32 0
|
|
%.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i22, <8 x half> %.splat20, <8 x half> %i20)
|
|
%arrayidx21 = getelementptr inbounds half, ptr %pSamples.0220, i32 3
|
|
%i25 = load <8 x half>, ptr %arrayidx21, align 4
|
|
%.splatinsert22 = insertelement <8 x half> undef, half %i7, i32 0
|
|
%.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i25, <8 x half> %.splat23, <8 x half> %i23)
|
|
%arrayidx24 = getelementptr inbounds half, ptr %pSamples.0220, i32 4
|
|
%i28 = load <8 x half>, ptr %arrayidx24, align 4
|
|
%.splatinsert25 = insertelement <8 x half> undef, half %i8, i32 0
|
|
%.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i28, <8 x half> %.splat26, <8 x half> %i26)
|
|
%arrayidx27 = getelementptr inbounds half, ptr %pSamples.0220, i32 5
|
|
%i31 = load <8 x half>, ptr %arrayidx27, align 4
|
|
%.splatinsert28 = insertelement <8 x half> undef, half %i9, i32 0
|
|
%.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i31, <8 x half> %.splat29, <8 x half> %i29)
|
|
%arrayidx30 = getelementptr inbounds half, ptr %pSamples.0220, i32 6
|
|
%i34 = load <8 x half>, ptr %arrayidx30, align 4
|
|
%.splatinsert31 = insertelement <8 x half> undef, half %i10, i32 0
|
|
%.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i34, <8 x half> %.splat32, <8 x half> %i32)
|
|
%arrayidx33 = getelementptr inbounds half, ptr %pSamples.0220, i32 7
|
|
%i37 = load <8 x half>, ptr %arrayidx33, align 4
|
|
%.splatinsert34 = insertelement <8 x half> undef, half %i11, i32 0
|
|
%.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i37, <8 x half> %.splat35, <8 x half> %i35)
|
|
%pSamples.1200 = getelementptr inbounds half, ptr %pSamples.0220, i32 8
|
|
br i1 %cmp38201, label %for.body, label %for.end
|
|
|
|
; Inner loop (entered only when numTaps > 15): accumulates 8 more taps per
; iteration and runs %smax times; the coefficient and sample pointers both
; advance by 8 half elements.
for.body: ; preds = %for.body, %while.body
|
|
%pSamples.1207 = phi ptr [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
|
|
%pCoeffsCur.0206 = phi ptr [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
|
|
%.pn205 = phi ptr [ %pCoeffsCur.0206, %for.body ], [ %i1, %while.body ]
|
|
%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
|
|
%vecAcc0.0203 = phi <8 x half> [ %i70, %for.body ], [ %i38, %while.body ]
|
|
%pSamples.0.pn202 = phi ptr [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
|
|
%incdec.ptr40 = getelementptr inbounds half, ptr %.pn205, i32 9
|
|
%i39 = load half, ptr %pCoeffsCur.0206, align 4
|
|
%incdec.ptr41 = getelementptr inbounds half, ptr %.pn205, i32 10
|
|
%i40 = load half, ptr %incdec.ptr40, align 4
|
|
%incdec.ptr42 = getelementptr inbounds half, ptr %.pn205, i32 11
|
|
%i41 = load half, ptr %incdec.ptr41, align 4
|
|
%incdec.ptr43 = getelementptr inbounds half, ptr %.pn205, i32 12
|
|
%i42 = load half, ptr %incdec.ptr42, align 4
|
|
%incdec.ptr44 = getelementptr inbounds half, ptr %.pn205, i32 13
|
|
%i43 = load half, ptr %incdec.ptr43, align 4
|
|
%incdec.ptr45 = getelementptr inbounds half, ptr %.pn205, i32 14
|
|
%i44 = load half, ptr %incdec.ptr44, align 4
|
|
%incdec.ptr46 = getelementptr inbounds half, ptr %.pn205, i32 15
|
|
%i45 = load half, ptr %incdec.ptr45, align 4
|
|
%i46 = load half, ptr %incdec.ptr46, align 4
|
|
%i48 = load <8 x half>, ptr %pSamples.1207, align 4
|
|
%.splatinsert48 = insertelement <8 x half> undef, half %i39, i32 0
|
|
%.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
|
|
%arrayidx50 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 9
|
|
%i51 = load <8 x half>, ptr %arrayidx50, align 4
|
|
%.splatinsert51 = insertelement <8 x half> undef, half %i40, i32 0
|
|
%.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i51, <8 x half> %.splat52, <8 x half> %i49)
|
|
%arrayidx53 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 10
|
|
%i54 = load <8 x half>, ptr %arrayidx53, align 4
|
|
%.splatinsert54 = insertelement <8 x half> undef, half %i41, i32 0
|
|
%.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i54, <8 x half> %.splat55, <8 x half> %i52)
|
|
%arrayidx56 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 11
|
|
%i57 = load <8 x half>, ptr %arrayidx56, align 4
|
|
%.splatinsert57 = insertelement <8 x half> undef, half %i42, i32 0
|
|
%.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i57, <8 x half> %.splat58, <8 x half> %i55)
|
|
%arrayidx59 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 12
|
|
%i60 = load <8 x half>, ptr %arrayidx59, align 4
|
|
%.splatinsert60 = insertelement <8 x half> undef, half %i43, i32 0
|
|
%.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i60, <8 x half> %.splat61, <8 x half> %i58)
|
|
%arrayidx62 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 13
|
|
%i63 = load <8 x half>, ptr %arrayidx62, align 4
|
|
%.splatinsert63 = insertelement <8 x half> undef, half %i44, i32 0
|
|
%.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i63, <8 x half> %.splat64, <8 x half> %i61)
|
|
%arrayidx65 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 14
|
|
%i66 = load <8 x half>, ptr %arrayidx65, align 4
|
|
%.splatinsert66 = insertelement <8 x half> undef, half %i45, i32 0
|
|
%.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i66, <8 x half> %.splat67, <8 x half> %i64)
|
|
%arrayidx68 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 15
|
|
%i69 = load <8 x half>, ptr %arrayidx68, align 4
|
|
%.splatinsert69 = insertelement <8 x half> undef, half %i46, i32 0
|
|
%.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i69, <8 x half> %.splat70, <8 x half> %i67)
|
|
%inc = add nuw nsw i32 %i.0204, 1
|
|
%pCoeffsCur.0 = getelementptr inbounds half, ptr %pCoeffsCur.0206, i32 8
|
|
%pSamples.1 = getelementptr inbounds half, ptr %pSamples.1207, i32 8
|
|
%exitcond = icmp eq i32 %inc, %smax
|
|
br i1 %exitcond, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %while.body
|
|
%vecAcc0.0.lcssa = phi <8 x half> [ %i38, %while.body ], [ %i70, %for.body ]
|
|
%pCoeffsCur.0.lcssa = phi ptr [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
|
|
%pSamples.1.lcssa = phi ptr [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
|
|
br i1 %cmp74210, label %while.end, label %while.body76
|
|
|
|
; Leftover taps: one coefficient and one sample vector per iteration, for
; %and iterations (skipped entirely when %and is zero).
while.body76: ; preds = %while.body76, %for.end
|
|
%pCoeffsCur.1214 = phi ptr [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
|
|
%vecAcc0.1213 = phi <8 x half> [ %i74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
|
|
%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
|
|
%pSamples.2211 = phi ptr [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
|
|
%incdec.ptr77 = getelementptr inbounds half, ptr %pCoeffsCur.1214, i32 1
|
|
%i71 = load half, ptr %pCoeffsCur.1214, align 4
|
|
%i73 = load <8 x half>, ptr %pSamples.2211, align 4
|
|
%.splatinsert78 = insertelement <8 x half> undef, half %i71, i32 0
|
|
%.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%i74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213)
|
|
%incdec.ptr80 = getelementptr inbounds half, ptr %pSamples.2211, i32 1
|
|
%dec = add nsw i32 %numCnt.0212, -1
|
|
%cmp74 = icmp sgt i32 %numCnt.0212, 1
|
|
br i1 %cmp74, label %while.body76, label %while.end.loopexit
|
|
|
|
while.end.loopexit: ; preds = %while.body76
|
|
%scevgep = getelementptr half, ptr %pSamples.1.lcssa, i32 %and
|
|
br label %while.end
|
|
|
|
; Store the accumulator, advance the output by 4 half elements, and set the
; next sample pointer to (current sample pointer + 4 - numTaps).
while.end: ; preds = %while.end.loopexit, %for.end
|
|
%pSamples.2.lcssa = phi ptr [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
|
|
%vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %i74, %while.end.loopexit ]
|
|
store <8 x half> %vecAcc0.1.lcssa, ptr %pOutput.0218, align 4
|
|
%add.ptr81 = getelementptr inbounds half, ptr %pOutput.0218, i32 4
|
|
%add.ptr82 = getelementptr inbounds half, ptr %pSamples.2.lcssa, i32 4
|
|
%add.ptr83 = getelementptr inbounds half, ptr %add.ptr82, i32 %idx.neg
|
|
%dec84 = add nsw i32 %blkCnt.0222, -1
|
|
%cmp5 = icmp eq i32 %dec84, 0
|
|
br i1 %cmp5, label %if.end, label %while.body
|
|
|
|
if.end: ; preds = %while.end, %if.then, %entry
|
|
ret void
|
|
}
|
|
|
|
; Instance type passed as %S to @arm_biquad_cascade_df2T_f16: an i8 followed
; by two pointers. The function addresses field 1 through %pState1.
%struct.arm_biquad_cascade_df2T_instance_f16 = type { i8, ptr, ptr }
|
|
; Codegen test: a two-level loop nest built from <8 x half> FMAs.
;   %S         - instance record { i8, ptr, ptr }; the fields are loaded as
;                %numStages, the state pointer and the coefficient pointer.
;   %pSrc      - input halfs for the first outer iteration; every later outer
;                iteration reads its input from %pDst (see the %pIn.0 phi).
;   %pDst      - output halfs.
;   %blockSize - blockSize >> 1 is the trip count of while.body (two halfs per
;                iteration); blockSize & 1 selects the if.then tail.
; The outer do.body loop is bottom-tested on the stage counter, so its body is
; entered before the counter is first checked. The expected assembly keeps the
; inner while.body loop as a wls/le loop on lr.
; The FileCheck assertions below are autogenerated (see the NOTE on the first
; line of the file); regenerate them instead of editing them by hand.
define void @arm_biquad_cascade_df2T_f16(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_biquad_cascade_df2T_f16:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
|
|
; CHECK-NEXT: .pad #4
|
|
; CHECK-NEXT: sub sp, #4
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11}
|
|
; CHECK-NEXT: vmov.i32 q0, #0x0
|
|
; CHECK-NEXT: ldrd r6, r12, [r0, #4]
|
|
; CHECK-NEXT: ldrb.w r9, [r0]
|
|
; CHECK-NEXT: vldr.16 s0, .LCPI17_0
|
|
; CHECK-NEXT: lsr.w r8, r3, #1
|
|
; CHECK-NEXT: b .LBB17_3
|
|
; CHECK-NEXT: .LBB17_1: @ %if.else
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
; CHECK-NEXT: vmovx.f16 s5, s4
|
|
; CHECK-NEXT: vstr.16 s4, [r6]
|
|
; CHECK-NEXT: .LBB17_2: @ %if.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
; CHECK-NEXT: vstr.16 s5, [r6, #2]
|
|
; CHECK-NEXT: add.w r12, r12, #10
|
|
; CHECK-NEXT: adds r6, #4
|
|
; CHECK-NEXT: subs.w r9, r9, #1
|
|
; CHECK-NEXT: mov r1, r2
|
|
; CHECK-NEXT: beq .LBB17_8
|
|
; CHECK-NEXT: .LBB17_3: @ %do.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB17_5 Depth 2
|
|
; CHECK-NEXT: vldrh.u16 q2, [r12]
|
|
; CHECK-NEXT: movs r5, #0
|
|
; CHECK-NEXT: vmov q4, q2
|
|
; CHECK-NEXT: vshlc q4, r5, #16
|
|
; CHECK-NEXT: vldrh.u16 q3, [r12, #4]
|
|
; CHECK-NEXT: vmov q5, q3
|
|
; CHECK-NEXT: vshlc q5, r5, #16
|
|
; CHECK-NEXT: vldrh.u16 q1, [r6]
|
|
; CHECK-NEXT: vmov.f32 s5, s1
|
|
; CHECK-NEXT: mov r5, r2
|
|
; CHECK-NEXT: wls lr, r8, .LBB17_6
|
|
; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
; CHECK-NEXT: mov r5, r2
|
|
; CHECK-NEXT: .LBB17_5: @ %while.body
|
|
; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrh r7, [r1], #4
|
|
; CHECK-NEXT: vmov r0, s0
|
|
; CHECK-NEXT: vfma.f16 q1, q2, r7
|
|
; CHECK-NEXT: ldrh r4, [r1, #-2]
|
|
; CHECK-NEXT: vmov.u16 r7, q1[0]
|
|
; CHECK-NEXT: vfma.f16 q1, q3, r7
|
|
; CHECK-NEXT: vins.f16 s5, s0
|
|
; CHECK-NEXT: vfma.f16 q1, q4, r4
|
|
; CHECK-NEXT: vmov.u16 r4, q1[1]
|
|
; CHECK-NEXT: vfma.f16 q1, q5, r4
|
|
; CHECK-NEXT: strh r4, [r5, #2]
|
|
; CHECK-NEXT: vmov.f32 s4, s5
|
|
; CHECK-NEXT: strh r7, [r5], #4
|
|
; CHECK-NEXT: vmov.16 q1[2], r0
|
|
; CHECK-NEXT: le lr, .LBB17_5
|
|
; CHECK-NEXT: .LBB17_6: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
; CHECK-NEXT: lsls r0, r3, #31
|
|
; CHECK-NEXT: beq .LBB17_1
|
|
; CHECK-NEXT: @ %bb.7: @ %if.then
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
; CHECK-NEXT: ldrh r0, [r1]
|
|
; CHECK-NEXT: vfma.f16 q1, q2, r0
|
|
; CHECK-NEXT: vmov.u16 r0, q1[0]
|
|
; CHECK-NEXT: vfma.f16 q1, q3, r0
|
|
; CHECK-NEXT: strh r0, [r5]
|
|
; CHECK-NEXT: vmovx.f16 s2, s4
|
|
; CHECK-NEXT: vstr.16 s2, [r6]
|
|
; CHECK-NEXT: b .LBB17_2
|
|
; CHECK-NEXT: .LBB17_8: @ %do.end
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11}
|
|
; CHECK-NEXT: add sp, #4
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
|
|
; CHECK-NEXT: .p2align 1
|
|
; CHECK-NEXT: @ %bb.9:
|
|
; CHECK-NEXT: .LCPI17_0:
|
|
; CHECK-NEXT: .short 0x0000 @ half 0
|
|
; Load the three instance fields and precompute the loop-invariant values:
; %div = blockSize >> 1 (inner trip count) and %and = blockSize & 1.
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 1
|
|
%i = load ptr, ptr %pState1, align 4
|
|
%numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 0
|
|
%i1 = load i8, ptr %numStages, align 4
|
|
%conv = zext i8 %i1 to i32
|
|
%pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 2
|
|
%i2 = load ptr, ptr %pCoeffs, align 4
|
|
%div = lshr i32 %blockSize, 1
|
|
%cmp.not90 = icmp eq i32 %div, 0
|
|
%and = and i32 %blockSize, 1
|
|
%tobool.not = icmp eq i32 %and, 0
|
|
br label %do.body
|
|
|
|
; Outer loop header, one iteration per stage. Loads two overlapping
; coefficient vectors (offsets 0 and 2 halfs), loads the state vector and
; zeroes its lanes 2 and 3, then builds vshlc-by-16 copies of both coefficient
; vectors (carry-in 0 for the first call, the first call's carry-out for the
; second). The coefficient pointer advances by 5 halfs per stage.
do.body: ; preds = %if.end, %entry
|
|
%stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
|
|
%pCurCoeffs.0 = phi ptr [ %i2, %entry ], [ %add.ptr2, %if.end ]
|
|
%pState.0 = phi ptr [ %i, %entry ], [ %pState.1, %if.end ]
|
|
%pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end ]
|
|
%i4 = load <8 x half>, ptr %pCurCoeffs.0, align 2
|
|
%add.ptr = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 2
|
|
%i6 = load <8 x half>, ptr %add.ptr, align 2
|
|
%add.ptr2 = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 5
|
|
%i8 = load <8 x half>, ptr %pState.0, align 2
|
|
%i9 = shufflevector <8 x half> %i8, <8 x half> <half poison, half poison, half 0xH0000, half 0xH0000, half poison, half poison, half poison, half poison>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
|
|
%i10 = bitcast <8 x half> %i4 to <8 x i16>
|
|
%i11 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i10, i32 0, i32 16)
|
|
%i12 = extractvalue { i32, <8 x i16> } %i11, 0
|
|
%i13 = extractvalue { i32, <8 x i16> } %i11, 1
|
|
%i14 = bitcast <8 x i16> %i13 to <8 x half>
|
|
%i15 = bitcast <8 x half> %i6 to <8 x i16>
|
|
%i16 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i15, i32 %i12, i32 16)
|
|
%i17 = extractvalue { i32, <8 x i16> } %i16, 1
|
|
%i18 = bitcast <8 x i16> %i17 to <8 x half>
|
|
br i1 %cmp.not90, label %while.end, label %while.body
|
|
|
|
; Inner loop: each iteration reads two input halfs, runs four FMAs against the
; state vector, stores two output halfs and rebuilds the state for the next
; iteration. %sample.091 counts down from %div to zero.
while.body: ; preds = %while.body, %do.body
|
|
%pIn.194 = phi ptr [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
|
|
%state.093 = phi <8 x half> [ %i30, %while.body ], [ %i9, %do.body ]
|
|
%pOut.192 = phi ptr [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
|
|
%sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
|
|
%incdec.ptr = getelementptr inbounds half, ptr %pIn.194, i32 1
|
|
%i19 = load half, ptr %pIn.194, align 2
|
|
%incdec.ptr4 = getelementptr inbounds half, ptr %pIn.194, i32 2
|
|
%i20 = load half, ptr %incdec.ptr, align 2
|
|
%.splatinsert = insertelement <8 x half> poison, half %i19, i32 0
|
|
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> poison, <8 x i32> zeroinitializer
|
|
%i21 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> %.splat, <8 x half> %state.093)
|
|
%i22 = extractelement <8 x half> %i21, i32 0
|
|
%.splat6 = shufflevector <8 x half> %i21, <8 x half> poison, <8 x i32> zeroinitializer
|
|
%i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat6, <8 x half> %i21)
|
|
%i24 = insertelement <8 x half> %i23, half 0xH0000, i32 3
|
|
%.splatinsert7 = insertelement <8 x half> poison, half %i20, i32 0
|
|
%.splat8 = shufflevector <8 x half> %.splatinsert7, <8 x half> poison, <8 x i32> zeroinitializer
|
|
%i25 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i14, <8 x half> %.splat8, <8 x half> %i24)
|
|
%i26 = extractelement <8 x half> %i25, i32 1
|
|
%.splat10 = shufflevector <8 x half> %i25, <8 x half> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
%i27 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i18, <8 x half> %.splat10, <8 x half> %i25)
|
|
%i28 = shufflevector <8 x half> %i27, <8 x half> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
%i29 = insertelement <8 x half> %i28, half 0xH0000, i32 2
|
|
%i30 = shufflevector <8 x half> %i29, <8 x half> %i27, <8 x i32> <i32 0, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
%incdec.ptr11 = getelementptr inbounds half, ptr %pOut.192, i32 1
|
|
store half %i22, ptr %pOut.192, align 2
|
|
%incdec.ptr12 = getelementptr inbounds half, ptr %pOut.192, i32 2
|
|
store half %i26, ptr %incdec.ptr11, align 2
|
|
%dec = add nsw i32 %sample.091, -1
|
|
%cmp.not = icmp eq i32 %dec, 0
|
|
br i1 %cmp.not, label %while.end, label %while.body
|
|
|
|
; Inner loop exit (also reached directly when %div is 0). Branches on the low
; bit of %blockSize: if.then when it is set, if.else when it is clear.
while.end: ; preds = %while.body, %do.body
|
|
%pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
|
|
%state.0.lcssa = phi <8 x half> [ %i9, %do.body ], [ %i30, %while.body ]
|
|
%pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
|
|
br i1 %tobool.not, label %if.else, label %if.then
|
|
|
|
; One remaining input half: two FMAs, lane 0 of the first result is stored to
; the output, lane 1 of the second result is stored to pState[0], and lane 2 is
; forwarded to if.end through %.sink.
if.then: ; preds = %while.end
|
|
%i31 = load half, ptr %pIn.1.lcssa, align 2
|
|
%.splatinsert14 = insertelement <8 x half> poison, half %i31, i32 0
|
|
%.splat15 = shufflevector <8 x half> %.splatinsert14, <8 x half> poison, <8 x i32> zeroinitializer
|
|
%i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> %.splat15, <8 x half> %state.0.lcssa)
|
|
%i33 = extractelement <8 x half> %i32, i32 0
|
|
%.splat17 = shufflevector <8 x half> %i32, <8 x half> poison, <8 x i32> zeroinitializer
|
|
%i34 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat17, <8 x half> %i32)
|
|
store half %i33, ptr %pOut.1.lcssa, align 2
|
|
%i35 = extractelement <8 x half> %i34, i32 1
|
|
store half %i35, ptr %pState.0, align 2
|
|
%i36 = extractelement <8 x half> %i34, i32 2
|
|
br label %if.end
|
|
|
|
; No remaining input: state lane 0 is stored to pState[0] and lane 1 is
; forwarded to if.end through %.sink.
if.else: ; preds = %while.end
|
|
%i37 = extractelement <8 x half> %state.0.lcssa, i32 0
|
|
store half %i37, ptr %pState.0, align 2
|
|
%i38 = extractelement <8 x half> %state.0.lcssa, i32 1
|
|
br label %if.end
|
|
|
|
; Outer loop latch: store %.sink to pState[1], advance the state pointer by two
; halfs, decrement the stage counter and exit once it reaches zero.
if.end: ; preds = %if.else, %if.then
|
|
%.sink = phi half [ %i38, %if.else ], [ %i36, %if.then ]
|
|
%i39 = getelementptr inbounds half, ptr %pState.0, i32 1
|
|
store half %.sink, ptr %i39, align 2
|
|
%pState.1 = getelementptr inbounds half, ptr %pState.0, i32 2
|
|
%dec23 = add i32 %stage.0, -1
|
|
%cmp24.not = icmp eq i32 %dec23, 0
|
|
br i1 %cmp24.not, label %do.end, label %do.body
|
|
|
|
do.end: ; preds = %if.end
|
|
ret void
|
|
}
|
|
|
|
; Codegen test: sums all eight half lanes of %in with two vector
; shuffle-and-add steps followed by one scalar add; every fadd carries the
; `fast` flag. The expected assembly is vrev32.16 + vadd.f16, vrev64.32 +
; vadd.f16, then a scalar vadd.f16 of s0 and s2.
; The FileCheck assertions below are autogenerated (see the NOTE on the first
; line of the file); regenerate them instead of editing them by hand.
define arm_aapcs_vfpcc half @vecAddAcrossF16Mve(<8 x half> %in) {
|
|
; CHECK-LABEL: vecAddAcrossF16Mve:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: vrev32.16 q1, q0
|
|
; CHECK-NEXT: vadd.f16 q0, q1, q0
|
|
; CHECK-NEXT: vrev64.32 q1, q0
|
|
; CHECK-NEXT: vadd.f16 q0, q0, q1
|
|
; CHECK-NEXT: vadd.f16 s0, s0, s2
|
|
; CHECK-NEXT: bx lr
|
|
entry:
|
|
; Step 1: swap the two halfs of every adjacent pair and add, so lanes 2k and
; 2k+1 both hold in[2k] + in[2k+1].
%i = shufflevector <8 x half> %in, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
|
|
%i1 = fadd fast <8 x half> %i, %in
|
|
; Step 2: viewed as <4 x i32>, move 32-bit lanes 1 and 3 into lanes 0 and 2
; (the other lanes are undef) and add, combining neighbouring pair sums.
%i2 = bitcast <8 x half> %i1 to <4 x i32>
|
|
%i3 = shufflevector <4 x i32> %i2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
|
|
%i4 = bitcast <4 x i32> %i3 to <8 x half>
|
|
%i5 = fadd fast <8 x half> %i1, %i4
|
|
; Step 3: half lane 0 now holds the sum of in[0..3] and half lane 4 the sum of
; in[4..7]; add the two scalars for the final result.
%i6 = extractelement <8 x half> %i5, i32 0
|
|
%i7 = extractelement <8 x half> %i5, i32 4
|
|
%add = fadd fast half %i6, %i7
|
|
ret half %add
|
|
}
|
|
|
|
; Intrinsic declarations shared by the tests in this file.
; vshlc returns { i32, <8 x i16> }; @arm_biquad_cascade_df2T_f16 feeds element 0
; of the first call into the i32 operand of the second call.
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
|
|
; Called by @test_fadd at the top of the file.
declare void @llvm.assume(i1)
|
|
; NOTE(review): no caller of vctp16 is visible in this part of the file;
; presumably used by a test defined earlier - confirm before removing.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
|
|
; Called by @arm_biquad_cascade_df2T_f16 for every vector multiply-add.
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
|
|
; NOTE(review): no caller of this masked store is visible in this part of the
; file; presumably used by a test defined earlier - confirm before removing.
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)
|