Sergei Barannikov e0ed0333f0
Reland "[ARM] Stop gluing ALU nodes to branches / selects" (#118887)
Re-landing #116970 after fixing a miscompilation error.

The original change made it possible for CMPZ to have multiple uses;
`ARMDAGToDAGISel::SelectCMPZ` was not prepared for this.

Pull Request: https://github.com/llvm/llvm-project/pull/118887


Original commit message:

Following #116547 and #116676, this PR changes the type of results and
operands of some nodes to accept / return a normal type instead of Glue.

Unfortunately, changing the result type of one node requires changing
the operand types of all potential consumer nodes, which in turn
requires changing the result types of all other possible producer nodes.
So this is a bulk change.
2024-12-07 10:14:36 +03:00

95 lines
4.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
; Four-field state record read by @foo: fields 0 and 1 are i32 values used to
; derive the loop element counts, fields 2 and 3 are pointers (loaded in @foo as
; %frame and %inbuf respectively).
%struct.SpeexPreprocessState_ = type { i32, i32, ptr, ptr }
; @foo contains two back-to-back do-while loops. Each loop computes its lane
; predicate with @llvm.arm.mve.vctp16 from a remaining-element counter that is
; decremented by 8 per iteration, and uses that predicate on every masked
; load/store (and, in the second loop, on the predicated convert/multiply).
; The autogenerated assertions below expect both loops to be emitted as
; tail-predicated low-overhead loops (dlstp.16 ... letp).
define void @foo(ptr nocapture readonly %st, ptr %x) {
; CHECK-LABEL: foo:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: ldrd r12, r3, [r0]
; CHECK-NEXT: ldrd r4, r2, [r0, #8]
; CHECK-NEXT: rsb r12, r12, r3, lsl #1
; CHECK-NEXT: dlstp.16 lr, r12
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r2], #16
; CHECK-NEXT: vstrh.16 q0, [r4], #16
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %do.end
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: mov.w r2, #6144
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB0_3: @ %do.body6
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vcvt.f16.s16 q0, q0
; CHECK-NEXT: vmul.f16 q0, q0, r2
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %do.end13
; CHECK-NEXT: pop {r4, pc}
; Preamble: %sub = 2 * st->field1 - st->field0 is the element count of the
; first loop; %2 (field 3) is its source pointer and %3 (field 2) its
; destination pointer.
entry:
%ps_size = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 1
%0 = load i32, ptr %ps_size, align 4
%mul = shl nsw i32 %0, 1
%1 = load i32, ptr %st, align 4
%sub = sub nsw i32 %mul, %1
%inbuf = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 3
%2 = load ptr, ptr %inbuf, align 4
%frame = getelementptr inbounds %struct.SpeexPreprocessState_, ptr %st, i32 0, i32 2
%3 = load ptr, ptr %frame, align 4
br label %do.body
; First loop: predicated copy of 8 x half per iteration from %pinbuff16.0 to
; %pframef16.0; both pointers advance by 8 half elements. The back-edge is
; taken while the counter value before the decrement is greater than 8.
do.body: ; preds = %do.body, %entry
%pinbuff16.0 = phi ptr [ %2, %entry ], [ %add.ptr, %do.body ]
%blkCnt.0 = phi i32 [ %sub, %entry ], [ %sub2, %do.body ]
%pframef16.0 = phi ptr [ %3, %entry ], [ %add.ptr1, %do.body ]
%4 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
%5 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %pinbuff16.0, i32 2, <8 x i1> %4, <8 x half> zeroinitializer)
tail call void @llvm.masked.store.v8f16.p0(<8 x half> %5, ptr %pframef16.0, i32 2, <8 x i1> %4)
%add.ptr = getelementptr inbounds half, ptr %pinbuff16.0, i32 8
%add.ptr1 = getelementptr inbounds half, ptr %pframef16.0, i32 8
%sub2 = add nsw i32 %blkCnt.0, -8
%cmp = icmp sgt i32 %blkCnt.0, 8
br i1 %cmp, label %do.body, label %do.end
; Between the loops: the frame pointer (field 2) and field 0 are loaded again
; from %st rather than reusing %3 / %1. The second loop writes starting at
; frame + %sub half elements and runs for st->field0 elements.
do.end: ; preds = %do.body
%6 = load ptr, ptr %frame, align 4
%add.ptr4 = getelementptr inbounds half, ptr %6, i32 %sub
%7 = load i32, ptr %st, align 4
br label %do.body6
; Second loop: predicated load of 8 x i16 from %px.0, predicated int-to-half
; conversion, predicated multiply by a splat of half 0xH1800, then predicated
; store to %pframef16.1. The expected assembly materialises the splat constant
; in a GPR (mov.w r2, #6144, i.e. 0x1800) and uses the vector-by-scalar vmul.f16.
; Same exit condition shape as the first loop (counter before decrement > 8).
do.body6: ; preds = %do.body6, %do.end
%px.0 = phi ptr [ %x, %do.end ], [ %add.ptr8, %do.body6 ]
%blkCnt.1 = phi i32 [ %7, %do.end ], [ %sub10, %do.body6 ]
%pframef16.1 = phi ptr [ %add.ptr4, %do.end ], [ %add.ptr9, %do.body6 ]
%8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.1)
%9 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %px.0, i32 2, <8 x i1> %8, <8 x i16> zeroinitializer)
%10 = tail call fast <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16> %9, i32 0, <8 x i1> %8, <8 x half> undef)
%11 = tail call fast <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> <half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800>, <8 x i1> %8, <8 x half> undef)
tail call void @llvm.masked.store.v8f16.p0(<8 x half> %11, ptr %pframef16.1, i32 2, <8 x i1> %8)
%add.ptr8 = getelementptr inbounds i16, ptr %px.0, i32 8
%add.ptr9 = getelementptr inbounds half, ptr %pframef16.1, i32 8
%sub10 = add nsw i32 %blkCnt.1, -8
%cmp12 = icmp sgt i32 %blkCnt.1, 8
br i1 %cmp12, label %do.body6, label %do.end13
do.end13: ; preds = %do.body6
ret void
}
; Intrinsics called by @foo: the MVE 16-bit lane predicate generator (vctp16),
; the generic masked load/store for <8 x half> and <8 x i16>, and the MVE
; predicated int-to-fp convert and predicated multiply.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16>, i32, <8 x i1>, <8 x half>)
declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)