
Re-landing #116970 after fixing miscompilation error. The original change made it possible for CMPZ to have multiple uses; `ARMDAGToDAGISel::SelectCMPZ` was not prepared for this. Pull Request: https://github.com/llvm/llvm-project/pull/118887 Original commit message: Following #116547 and #116676, this PR changes the type of results and operands of some nodes to accept / return a normal type instead of Glue. Unfortunately, changing the result type of one node requires changing the operand types of all potential consumer nodes, which in turn requires changing the result types of all other possible producer nodes. So this is a bulk change.
95 lines
4.5 KiB
LLVM
95 lines
4.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; The command below compiles this file with llc for the thumbv8.1m.main
; triple with the mve.fp feature and tail predication enabled, verifies the
; machine instructions, and pipes the assembly to FileCheck.
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s

; Record read by @foo through its %st argument. Field layout by index:
;   0: i32 - loaded directly through %st (offset 0)
;   1: i32 - addressed as %ps_size in @foo
;   2: ptr - addressed as %frame in @foo
;   3: ptr - addressed as %inbuf in @foo
%struct.SpeexPreprocessState_ = type { i32, i32, ptr, ptr }

; @foo(%st, %x) runs two predicated loops over 8 x 16-bit vectors:
;
;   do.body  - copies half-precision lanes from the buffer held in field 3 of
;              *%st to the buffer held in field 2. The element count is
;              (2 * field 1) - field 0, computed once in the entry block.
;   do.body6 - loads i16 lanes from %x, converts them to half, multiplies them
;              by a splat of half 0xH1800 and stores the result into the
;              field-2 buffer starting %sub elements in. The element count is
;              field 0, reloaded after the first loop.
;
; Both loops build their lane mask with @llvm.arm.mve.vctp16 from the
; remaining count, advance by 8 elements per iteration and repeat while the
; count seen by the current iteration is greater than 8 (signed compare).
;
; The autogenerated assertions expect each loop to be emitted as a
; dlstp.16 ... letp loop, and the splat multiplier to be materialised as the
; scalar #6144 (0x1800) consumed by vmul.f16.
define void @foo(ptr nocapture readonly %st, ptr %x) {
; CHECK-LABEL: foo:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrd r12, r3, [r0]
; CHECK-NEXT: ldrd r4, r2, [r0, #8]
; CHECK-NEXT: rsb r12, r12, r3, lsl #1
; CHECK-NEXT: dlstp.16 lr, r12
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r2], #16
; CHECK-NEXT: vstrh.16 q0, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %do.end
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: mov.w r2, #6144
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB0_3: @ %do.body6
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vcvt.f16.s16 q0, q0
; CHECK-NEXT: vmul.f16 q0, q0, r2
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %do.end13
; CHECK-NEXT: pop {r4, pc}
entry:
  ; First loop count: %sub = (field 1 << 1) - field 0.
  %ps_size = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 1
  %0 = load i32, ptr %ps_size, align 4
  %mul = shl nsw i32 %0, 1
  %1 = load i32, ptr %st, align 4
  %sub = sub nsw i32 %mul, %1
  ; Source (field 3) and destination (field 2) pointers for the first loop.
  %inbuf = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 3
  %2 = load ptr, ptr %inbuf, align 4
  %frame = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 2
  %3 = load ptr, ptr %frame, align 4
  br label %do.body

do.body: ; preds = %do.body, %entry
  %pinbuff16.0 = phi ptr [ %2, %entry ], [ %add.ptr, %do.body ]
  %blkCnt.0 = phi i32 [ %sub, %entry ], [ %sub2, %do.body ]
  %pframef16.0 = phi ptr [ %3, %entry ], [ %add.ptr1, %do.body ]
  ; Lane mask from the remaining count; it guards both the load and the store.
  %4 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
  %5 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %pinbuff16.0, i32 2, <8 x i1> %4, <8 x half> zeroinitializer)
  tail call void @llvm.masked.store.v8f16.p0(<8 x half> %5, ptr %pframef16.0, i32 2, <8 x i1> %4)
  ; Step both pointers by one vector (8 halves) and the count by 8.
  %add.ptr = getelementptr inbounds half, ptr %pinbuff16.0, i32 8
  %add.ptr1 = getelementptr inbounds half, ptr %pframef16.0, i32 8
  %sub2 = add nsw i32 %blkCnt.0, -8
  ; The exit test uses the pre-decrement count.
  %cmp = icmp sgt i32 %blkCnt.0, 8
  br i1 %cmp, label %do.body, label %do.end

do.end: ; preds = %do.body
  ; Field 2 and field 0 are loaded again here rather than reusing %3 / %1.
  %6 = load ptr, ptr %frame, align 4
  ; Second loop writes to the field-2 buffer starting %sub halves in.
  %add.ptr4 = getelementptr inbounds half, ptr %6, i32 %sub
  %7 = load i32, ptr %st, align 4
  br label %do.body6

do.body6: ; preds = %do.body6, %do.end
  %px.0 = phi ptr [ %x, %do.end ], [ %add.ptr8, %do.body6 ]
  %blkCnt.1 = phi i32 [ %7, %do.end ], [ %sub10, %do.body6 ]
  %pframef16.1 = phi ptr [ %add.ptr4, %do.end ], [ %add.ptr9, %do.body6 ]
  ; One mask drives the load, the conversion, the multiply and the store.
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.1)
  %9 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %px.0, i32 2, <8 x i1> %8, <8 x i16> zeroinitializer)
  ; i16 -> half conversion; the assertions expect vcvt.f16.s16 for this call.
  %10 = tail call fast <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16> %9, i32 0, <8 x i1> %8, <8 x half> undef)
  ; Multiply by a splat of half 0xH1800 (bit pattern 0x1800 = 6144).
  %11 = tail call fast <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> <half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800>, <8 x i1> %8, <8 x half> undef)
  tail call void @llvm.masked.store.v8f16.p0(<8 x half> %11, ptr %pframef16.1, i32 2, <8 x i1> %8)
  ; Step both pointers by one vector (8 elements) and the count by 8.
  %add.ptr8 = getelementptr inbounds i16, ptr %px.0, i32 8
  %add.ptr9 = getelementptr inbounds half, ptr %pframef16.1, i32 8
  %sub10 = add nsw i32 %blkCnt.1, -8
  ; The exit test uses the pre-decrement count.
  %cmp12 = icmp sgt i32 %blkCnt.1, 8
  br i1 %cmp12, label %do.body6, label %do.end13

do.end13: ; preds = %do.body6
  ret void
}

; Intrinsics called by @foo.

; Builds the <8 x i1> lane mask for both loops from an i32 element count.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)

; Masked load of 8 half lanes: (pointer, alignment, mask, pass-through value).
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)

; Masked store of 8 half lanes: (value, pointer, alignment, mask).
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)

; Masked load of 8 i16 lanes: (pointer, alignment, mask, pass-through value).
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)

; Predicated i16 -> half conversion; @foo passes 0 for the i32 operand and
; undef for the trailing <8 x half> operand.
declare <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16>, i32, <8 x i1>, <8 x half>)

; Predicated half multiply; @foo passes undef for the trailing <8 x half>
; operand.
declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)