; NOTE: Some loaded values are used and stored below so the loads (and the
; interleave-group mask) are not optimized away before vectorization.
; RUN: opt -passes=loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s

target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"

; Test for PR45572.

; Check that interleave groups and decisions based on them are correctly
; invalidated with tail-folding on platforms where masked interleaved accesses
; are disabled.

; Make sure a vector body has been created, 64 element vectors are used and a
; block predicate has been computed. Also make sure the loads are not widened.

; CHECK-LABEL: @test1
; CHECK: vector.body:
; CHECK: icmp ule <64 x i32> %vec.ind
; CHECK-NOT: load <{{.*}} x i32>

define void @test1(ptr %arg, i32 %N) #0 {
|
|
entry:
|
|
%tmp = alloca i32
|
|
br label %loop
|
|
|
|
loop: ; preds = %bb2, %bb
|
|
%iv = phi i32 [ %iv.next, %loop], [ 0, %entry ]
|
|
%idx.mul = mul nuw nsw i32 %iv, 7
|
|
%idx.start = add nuw nsw i32 %idx.mul, 1
|
|
%tmp6 = getelementptr inbounds i32, ptr %arg, i32 %idx.start
|
|
%tmp7 = load i32, ptr %tmp6, align 4
|
|
%tmp8 = add nuw nsw i32 %idx.start, 1
|
|
%tmp9 = getelementptr inbounds i32, ptr %arg, i32 %tmp8
|
|
%tmp10 = load i32, ptr %tmp9, align 4
|
|
%tmp11 = add nuw nsw i32 %idx.start, 2
|
|
%tmp12 = getelementptr inbounds i32, ptr %arg, i32 %tmp11
|
|
%tmp13 = load i32, ptr %tmp12, align 4
|
|
%tmp14 = add nuw nsw i32 %idx.start, 3
|
|
%tmp15 = getelementptr inbounds i32, ptr %arg, i32 %tmp14
|
|
%tmp16 = load i32, ptr %tmp15, align 4
|
|
%tmp18 = add nuw nsw i32 %idx.start, 4
|
|
%tmp19 = getelementptr inbounds i32, ptr %arg, i32 %tmp18
|
|
%tmp20 = load i32, ptr %tmp19, align 4
|
|
%tmp21 = add nuw nsw i32 %idx.start, 5
|
|
%tmp22 = getelementptr inbounds i32, ptr %arg, i32 %tmp21
|
|
%tmp23 = load i32, ptr %tmp22, align 4
|
|
%tmp25 = add nuw nsw i32 %idx.start, 6
|
|
%tmp26 = getelementptr inbounds i32, ptr %arg, i32 %tmp25
|
|
%tmp27 = load i32, ptr %tmp26, align 4
|
|
%add = add i32 %tmp7, %tmp27
|
|
store i32 %add, ptr %tmp, align 1
|
|
%iv.next= add nuw nsw i32 %iv, 1
|
|
%exit.cond = icmp eq i32 %iv.next, %N
|
|
br i1 %exit.cond, label %exit, label %loop
|
|
|
|
exit: ; preds = %loop
|
|
ret void
|
|
}

; The loop below only requires tail folding due to interleave groups with gaps.
; Make sure the loads are not widened.

; CHECK-LABEL: @test2
; CHECK: vector.body:
; CHECK-NOT: load <{{.*}} x i32>
define void @test2(ptr %arg) #1 {
|
|
entry:
|
|
%tmp = alloca i32
|
|
br label %loop
|
|
|
|
loop: ; preds = %bb2, %bb
|
|
%iv = phi i32 [ %iv.next, %loop], [ 0, %entry ]
|
|
%idx.start = mul nuw nsw i32 %iv, 5
|
|
%tmp6 = getelementptr inbounds i32, ptr %arg, i32 %idx.start
|
|
%tmp7 = load i32, ptr %tmp6, align 4
|
|
%tmp8 = add nuw nsw i32 %idx.start, 1
|
|
%tmp9 = getelementptr inbounds i32, ptr %arg, i32 %tmp8
|
|
%tmp10 = load i32, ptr %tmp9, align 4
|
|
%tmp11 = add nuw nsw i32 %idx.start, 2
|
|
%tmp12 = getelementptr inbounds i32, ptr %arg, i32 %tmp11
|
|
%tmp13 = load i32, ptr %tmp12, align 4
|
|
%tmp14 = add nuw nsw i32 %idx.start, 3
|
|
%tmp15 = getelementptr inbounds i32, ptr %arg, i32 %tmp14
|
|
%tmp16 = load i32, ptr %tmp15, align 4
|
|
%add = add i32 %tmp7, %tmp16
|
|
store i32 %add, ptr %tmp, align 1
|
|
%iv.next= add nuw nsw i32 %iv, 1
|
|
%exit.cond = icmp eq i32 %iv.next, 128
|
|
br i1 %exit.cond, label %exit, label %loop
|
|
|
|
exit: ; preds = %loop
|
|
ret void
|
|
}

attributes #0 = { "target-features"="+hvx,+hvx-length128b" }
attributes #1 = { optsize "target-features"="+hvx,+hvx-length128b" }