llvm-project/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
Philip Reames f79214d1e1 [LV] Support predicated div/rem operations via safe-divisor select idiom
This patch adds support for vectorizing conditionally executed div/rem operations via a variant of widening. The existing support for predicated divrem in the vectorizer requires scalarization which we can't do for scalable vectors.

The basic idea is that we can always divide (take remainder) by 1 without executing UB. As such, we can use the active lane mask to conditional select either the actual divisor for active lanes, or a constant one for inactive lanes. We already account for the cost of the active lane mask, so the only additional cost is a splat of one and the vector select. This is one of several possible approaches to this problem; see the review thread for discussion on some of the others.  This one was chosen mostly because it was straight forward, and none of the others seemed oviously better.

I enabled the new code only for scalable vectors. We could also legally enable it for fixed vectors as well, but I haven't thought through the cost tradeoffs between widening and scalarization enough to know if that's profitable. This will be explored in future patches.

Differential Revision: https://reviews.llvm.org/D130164
2022-08-24 10:07:59 -07:00

92 lines
3.1 KiB
LLVM

; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
; The test predication_in_loop corresponds
; to the following function
; for (long long i = 0; i < 1024; i++) {
; if (cond[i])
; a[i] /= b[i];
; }
define void @predication_in_loop(i32* %a, i32* %b, i32* %cond) #0 {
; CHECK-LABEL: @predication_in_loop
; CHECK: sdiv <vscale x 4 x i32>
;
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.inc, %entry
ret void
for.body: ; preds = %entry, %for.inc
%i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %cond, i64 %i.09
%0 = load i32, i32* %arrayidx, align 4
%tobool.not = icmp eq i32 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
%arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %i.09
%1 = load i32, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.09
%2 = load i32, i32* %arrayidx2, align 4
%div = sdiv i32 %2, %1
store i32 %div, i32* %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i64 %i.09, 1
%exitcond.not = icmp eq i64 %inc, 1024
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
;
; This test unpredicated_loop_predication_through_tailfolding corresponds
; to the following function
; for (long long i = 0; i < 1024; i++) {
; a[i] /= b[i];
; }
; Scalarization not possible in the main loop when there is no predication and
; epilogue should not be able to allow scalarization
; otherwise it could be able to vectorize, but will not because
; "Max legal vector width too small, scalable vectorization unfeasible.."
define void @unpredicated_loop_predication_through_tailfolding(i32* %a, i32* %b) #0 {
; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding
; CHECK-NOT: sdiv <vscale x 4 x i32>
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%sdiv = sdiv i32 %1, %0
%2 = add nuw nsw i64 %iv, 8
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %sdiv, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
attributes #0 = { "target-features"="+sve" }
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}