Follow-up to https://github.com/llvm/llvm-project/pull/169576 to enable UDiv canonicalization if the start of the AddRec is not constant. The fold is not restricted to constant start values, as long as we are able to compute a constant remainder. The fold is only applied if the subtraction of the remainder can be folded into to start expression, but that is just to avoid creating more complex AddRecs. For reference, the proof from #169576 is https://alive2.llvm.org/ce/z/iu2tav PR: https://github.com/llvm/llvm-project/pull/170474
85 lines
2.8 KiB
LLVM
85 lines
2.8 KiB
LLVM
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
|
|
; REQUIRES: asserts
|
|
|
|
; CHECK: 'foo'
|
|
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %shift = ashr i32 %val, %k
|
|
; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
|
|
; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k>
|
|
define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr {
|
|
entry:
|
|
br label %body
|
|
|
|
body:
|
|
%i = phi i64 [ 0, %entry ], [ %next, %body ]
|
|
%ptr = getelementptr inbounds i32, ptr %p, i64 %i
|
|
%val = load i32, ptr %ptr, align 4
|
|
%shift = ashr i32 %val, %k
|
|
store i32 %shift, ptr %ptr, align 4
|
|
%next = add nuw nsw i64 %i, 1
|
|
%cmp = icmp eq i64 %next, 16
|
|
br i1 %cmp, label %exit, label %body
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
; CHECK: 'shift_and_masked_load_store'
|
|
; CHECK: Cost of 1 for VF 2: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2>
|
|
; CHECK: Cost of 1 for VF 4: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2>
|
|
; CHECK: Cost of 4 for VF 8: WIDEN ir<%shifted> = lshr ir<%iv>, ir<2>
|
|
define void @shift_and_masked_load_store(i64 %trip.count) #0 {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
|
|
%shifted = lshr i64 %iv, 2
|
|
%masked.idx = and i64 %shifted, 1
|
|
%load.ptr = getelementptr i16, ptr poison, i64 %masked.idx
|
|
%val = load i16, ptr %load.ptr, align 2
|
|
%store.idx = shl nuw i64 %iv, 2
|
|
%store.ptr = getelementptr i8, ptr poison, i64 %store.idx
|
|
store i16 %val, ptr %store.ptr, align 2
|
|
%iv.next = add i64 %iv, 1
|
|
%cmp = icmp eq i64 %iv, %trip.count
|
|
br i1 %cmp, label %exit, label %loop
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
define i64 @sdiv_arg_outer_iv(ptr noalias %dst, ptr %src) {
|
|
; CHECK: 'sdiv_arg_outer_iv'
|
|
; CHECK: Cost of 0 for VF 2: CLONE ir<%div> = sdiv ir<%add.offset>, ir<8>
|
|
; CHECK: Cost of 0 for VF 4: CLONE ir<%div> = sdiv ir<%add.offset>, ir<8>
|
|
; CHECK: Cost of 0 for VF 8: CLONE ir<%div> = sdiv ir<%add.offset>, ir<8>
|
|
; CHECK: Cost of 0 for VF 16: REPLICATE ir<%div> = sdiv ir<%add.offset>, ir<8>
|
|
entry:
|
|
br label %outer.header
|
|
|
|
outer.header:
|
|
%outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
|
|
%offset = shl nsw i32 %outer.iv, 7
|
|
br label %loop
|
|
|
|
loop:
|
|
%iv = phi i64 [ 0, %outer.header ], [ %iv.next, %loop ]
|
|
%iv.trunc = trunc i64 %iv to i32
|
|
%add.offset = add i32 %offset, %iv.trunc
|
|
%div = sdiv i32 %add.offset, 8
|
|
%div.ext = sext i32 %div to i64
|
|
%gep.src = getelementptr i8, ptr %src, i64 %div.ext
|
|
%l = load i8, ptr %gep.src, align 1
|
|
%gep.dst = getelementptr i8, ptr %dst, i64 %iv
|
|
store i8 %l, ptr %gep.dst, align 1
|
|
%iv.next = add i64 %iv, 1
|
|
%ec = icmp eq i64 %iv, 64
|
|
br i1 %ec, label %outer.latch, label %loop
|
|
|
|
outer.latch:
|
|
%outer.iv.next = add nsw i32 %outer.iv, 1
|
|
br label %outer.header
|
|
}
|
|
|
|
attributes #0 = { "target-features"="+avx2" "tune-cpu"="alderlake" }
|