Move selectInterleaveCount to LoopVectorizationPlanner and retrieve some information directly from VPlan. Register pressure was already computed for a VPlan, and with this patch we now also check for reductions directly on VPlan, as well as checking how many load and store operations remain in the loop. This should be mostly NFC, but we may compute slightly different interleave counts, except for some edge cases, e.g. where dead loads have been removed. This shouldn't happen in practice, and the patch doesn't cause changes across a large test corpus on AArch64. Computing the interleave count based on VPlan allows for making better decisions in presence of VPlan optimizations, for example when operations on interleave groups are narrowed. Note that there are a few test changes for tests that were still checking the legacy cost-model output when it was computed in selectInterleaveCount. PR: https://github.com/llvm/llvm-project/pull/149702
33 lines
1.0 KiB
LLVM
33 lines
1.0 KiB
LLVM
; REQUIRES: asserts
; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -passes=loop-vectorize \
; RUN: -force-vector-width=4 -debug-only=loop-vectorize \
; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
; RUN: FileCheck %s
;
; Check that a scalarized load/store does not get a cost for inserts/
; extracts, since z13 supports element load/store.

; @fun - test input for the loop vectorizer's scalarization cost output.
;
; Equivalent C shape (the body runs at least once, the exit test is signed):
;   i = 0; do { data[i] += 1; i += 2; } while (i < n);
;
; The induction variable advances by 2, so the i32 load and store touch every
; other element of %data. The FileCheck directives at the end of the body
; expect the vectorizer's debug output to report both memory operations as
; scalarized, and to print a cost of 4 at VF 4 for each REPLICATE recipe.
define void @fun(ptr %data, i64 %n) {
entry:
  br label %for.body

for.body:
  ; %i is the element index; it starts at 0 and is bumped by 2 per iteration.
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds i32, ptr %data, i64 %i
  ; Read-modify-write of the same address: load, add 1, store back.
  %tmp1 = load i32, ptr %tmp0, align 4
  %tmp2 = add i32 %tmp1, 1
  store i32 %tmp2, ptr %tmp0, align 4
  %i.next = add nuw nsw i64 %i, 2
  ; Signed compare of the incremented index against %n decides the back edge.
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void

; Both memory operations must be reported as scalarized.
; CHECK: LV: Scalarizing: %tmp1 = load i32, ptr %tmp0, align 4
; CHECK: LV: Scalarizing: store i32 %tmp2, ptr %tmp0, align 4

; Expected per-recipe cost at VF 4 (presumably one unit per lane, with no
; extra insert/extract overhead -- confirm against the SystemZ cost model).
; CHECK: Cost of 4 for VF 4: REPLICATE ir<%tmp1> = load ir<%tmp0>
; CHECK: Cost of 4 for VF 4: REPLICATE store ir<%tmp2>, ir<%tmp0>
}