Currently, when tail-predicating loops, VPT blocks need to be created with the VCTP predicate in case we need to revert to the non-tail-predicated form. This has the unfortunate side effect of sometimes severely hampering post-RA scheduling, because the instructions are already locked into VPT blocks and cannot be ordered independently. This patch addresses that by moving the creation of VPT blocks later in the pipeline, after post-RA scheduling has been performed. This allows better post-RA scheduling before the VPT blocks are created, leading to better tail-predicated loops. Differential Revision: https://reviews.llvm.org/D113094
108 lines
4.5 KiB
LLVM
108 lines
4.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
; This test has an instruction that gets sunk into the loop, that is an
|
|
; active.lane.mask operand. (%exitcount.ptrcnt.to.int = ptrtoint). We
|
|
; need to make sure it is loop invariant.
|
|
|
|
; @a takes %b (i32*, readnone) and %c (i8*) and returns i32 undef.
; The vector loop is entered only when %b, viewed as an i8*, is
; unsigned-greater than %c. Each iteration builds the 16 pointers
; %c+%index .. %c+%index+15, truncates their integer values to 8 bits, and
; stores the resulting <16 x i8> at %c+%index under a lane mask computed
; from %index and the byte distance between %b and %c.
; The autogenerated assertions that follow expect the loop to be emitted
; as a dlstp.8 / letp pair.
define i32 @a(i32* readnone %b, i8* %c) {
|
|
; CHECK-LABEL: a:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, lr}
|
|
; CHECK-NEXT: push {r4, lr}
|
|
; CHECK-NEXT: cmp r0, r1
|
|
; CHECK-NEXT: it ls
|
|
; CHECK-NEXT: popls {r4, pc}
|
|
; CHECK-NEXT: .LBB0_1: @ %while.body.preheader
|
|
; CHECK-NEXT: subs r4, r0, r1
|
|
; CHECK-NEXT: movs r2, #0
|
|
; CHECK-NEXT: mov r3, r1
|
|
; CHECK-NEXT: dlstp.8 lr, r4
|
|
; CHECK-NEXT: .LBB0_2: @ %vector.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: adds r0, r1, r2
|
|
; CHECK-NEXT: vidup.u8 q0, r0, #1
|
|
; CHECK-NEXT: adds r2, #16
|
|
; CHECK-NEXT: vstrb.8 q0, [r3], #16
|
|
; CHECK-NEXT: letp lr, .LBB0_2
|
|
; CHECK-NEXT: @ %bb.3: @ %while.end
|
|
; CHECK-NEXT: pop {r4, pc}
|
|
; Guard: go straight to %while.end unless %b (as i8*) is above %c, unsigned.
entry:
|
|
%0 = bitcast i32* %b to i8*
|
|
%cmp3 = icmp ugt i8* %0, %c
|
|
br i1 %cmp3, label %while.body.preheader, label %while.end
|
|
|
|
; Preheader: %exitcount.ptrcnt.to.int is the integer value of %b minus the
; integer value of %c, formed as a GEP of %0 by the negated integer value of
; %c. It is defined here, outside the loop, and is the second operand of the
; lane-mask call in %vector.body. %n.vec is that count rounded up to a
; multiple of 16 (add 15, then and with -16).
while.body.preheader: ; preds = %entry
|
|
%c5 = ptrtoint i8* %c to i32
|
|
%1 = sub i32 0, %c5
|
|
%uglygep = getelementptr i8, i8* %0, i32 %1
|
|
%exitcount.ptrcnt.to.int = ptrtoint i8* %uglygep to i32
|
|
%n.rnd.up = add i32 %exitcount.ptrcnt.to.int, 15
|
|
%n.vec = and i32 %n.rnd.up, -16
|
|
br label %vector.body
|
|
|
|
; Loop body. %index starts at 0 and advances by 16 per iteration, so its low
; four bits are always zero and `or %index, k` for k in 1..15 produces the
; same value as %index + k.
vector.body: ; preds = %vector.body, %while.body.preheader
|
|
%index = phi i32 [ 0, %while.body.preheader ], [ %index.next, %vector.body ]
|
|
%next.gep = getelementptr i8, i8* %c, i32 %index
|
|
%2 = or i32 %index, 1
|
|
%next.gep7 = getelementptr i8, i8* %c, i32 %2
|
|
%3 = or i32 %index, 2
|
|
%next.gep8 = getelementptr i8, i8* %c, i32 %3
|
|
%4 = or i32 %index, 3
|
|
%next.gep9 = getelementptr i8, i8* %c, i32 %4
|
|
%5 = or i32 %index, 4
|
|
%next.gep10 = getelementptr i8, i8* %c, i32 %5
|
|
%6 = or i32 %index, 5
|
|
%next.gep11 = getelementptr i8, i8* %c, i32 %6
|
|
%7 = or i32 %index, 6
|
|
%next.gep12 = getelementptr i8, i8* %c, i32 %7
|
|
%8 = or i32 %index, 7
|
|
%next.gep13 = getelementptr i8, i8* %c, i32 %8
|
|
%9 = or i32 %index, 8
|
|
%next.gep14 = getelementptr i8, i8* %c, i32 %9
|
|
%10 = or i32 %index, 9
|
|
%next.gep15 = getelementptr i8, i8* %c, i32 %10
|
|
%11 = or i32 %index, 10
|
|
%next.gep16 = getelementptr i8, i8* %c, i32 %11
|
|
%12 = or i32 %index, 11
|
|
%next.gep17 = getelementptr i8, i8* %c, i32 %12
|
|
%13 = or i32 %index, 12
|
|
%next.gep18 = getelementptr i8, i8* %c, i32 %13
|
|
%14 = or i32 %index, 13
|
|
%next.gep19 = getelementptr i8, i8* %c, i32 %14
|
|
%15 = or i32 %index, 14
|
|
%next.gep20 = getelementptr i8, i8* %c, i32 %15
|
|
%16 = or i32 %index, 15
|
|
%next.gep21 = getelementptr i8, i8* %c, i32 %16
|
|
; Pack the 16 pointers into one <16 x i8*> vector; lane k holds the pointer
; built from offset %index | k.
%17 = insertelement <16 x i8*> poison, i8* %next.gep, i32 0
|
|
%18 = insertelement <16 x i8*> %17, i8* %next.gep7, i32 1
|
|
%19 = insertelement <16 x i8*> %18, i8* %next.gep8, i32 2
|
|
%20 = insertelement <16 x i8*> %19, i8* %next.gep9, i32 3
|
|
%21 = insertelement <16 x i8*> %20, i8* %next.gep10, i32 4
|
|
%22 = insertelement <16 x i8*> %21, i8* %next.gep11, i32 5
|
|
%23 = insertelement <16 x i8*> %22, i8* %next.gep12, i32 6
|
|
%24 = insertelement <16 x i8*> %23, i8* %next.gep13, i32 7
|
|
%25 = insertelement <16 x i8*> %24, i8* %next.gep14, i32 8
|
|
%26 = insertelement <16 x i8*> %25, i8* %next.gep15, i32 9
|
|
%27 = insertelement <16 x i8*> %26, i8* %next.gep16, i32 10
|
|
%28 = insertelement <16 x i8*> %27, i8* %next.gep17, i32 11
|
|
%29 = insertelement <16 x i8*> %28, i8* %next.gep18, i32 12
|
|
%30 = insertelement <16 x i8*> %29, i8* %next.gep19, i32 13
|
|
%31 = insertelement <16 x i8*> %30, i8* %next.gep20, i32 14
|
|
%32 = insertelement <16 x i8*> %31, i8* %next.gep21, i32 15
|
|
; Lane mask from the current index and the count computed in the preheader.
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %exitcount.ptrcnt.to.int)
|
|
; Turn the pointer vector into integers, keep the low 8 bits of each lane,
; and store the result at %next.gep under the lane mask (i32 operand is 1).
%33 = ptrtoint <16 x i8*> %32 to <16 x i32>
|
|
%34 = trunc <16 x i32> %33 to <16 x i8>
|
|
%35 = bitcast i8* %next.gep to <16 x i8>*
|
|
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %34, <16 x i8>* %35, i32 1, <16 x i1> %active.lane.mask)
|
|
; Advance by 16 and leave the loop once the index equals the rounded-up
; count %n.vec.
%index.next = add i32 %index, 16
|
|
%36 = icmp eq i32 %index.next, %n.vec
|
|
br i1 %36, label %while.end, label %vector.body
|
|
|
|
; Common exit for both the entry guard and the loop; the result is undef.
while.end: ; preds = %vector.body, %entry
|
|
ret i32 undef
|
|
}
|
|
|
|
; Lane-mask intrinsic called in %vector.body: two i32 operands, <16 x i1> result.
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
|
|
; Masked-store intrinsic called in %vector.body: <16 x i8> value, destination
; pointer, an immediate i32 (the alignment according to the LLVM LangRef; the
; call in @a passes 1), and the <16 x i1> mask.
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
|