llvm-project/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
David Green 352a836176
[InstCombine] Canonicalize non-i8 gep of mul to i8 (#96606)
This is a small canonicalization for `gep i32, p, (mul x, C)` -> `gep
i8, p, (mul x, C*4)`, so that the mul can combine both of the constant
multiplications, and we take a small step towards canonicalizing more
geps to i8.

It currently doesn't attempt to check for multiple uses on the mul, but
that should be possible if it sounds better. Let me know what you think
of the idea in general.
2024-06-26 14:25:54 +01:00

93 lines
5.5 KiB
LLVM

; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF1
; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF2
; CHECKUF1: for.body.preheader:
; CHECKUF1-DAG: %wide.trip.count = zext nneg i32 %N to i64
; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
; CHECKUF1: vector.ph:
; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1: vector.body:
; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index
; CHECKUF1: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
; CHECKUF1: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]
; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
; CHECKUF2: for.body.preheader:
; CHECKUF2-DAG: %wide.trip.count = zext nneg i32 %N to i64
; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
; CHECKUF2: vector.ph:
; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2: vector.body:
; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 5
; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds i8, ptr %[[IDXB]], i64 %[[VSCALE2]]
; CHECKUF2: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, ptr %[[IDXB_NEXT]], align 8
; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 5
; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds i8, ptr %[[IDXA]], i64 %[[VSCALE2]]
; CHECKUF2: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], ptr %[[IDXA_NEXT]], align 8
; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
define void @loop(i32 %N, ptr nocapture %a, ptr nocapture readonly %b) {
entry:
%cmp7 = icmp sgt i32 %N, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds double, ptr %b, i64 %indvars.iv
%0 = load double, ptr %arrayidx, align 8
%add = fadd double %0, 1.000000e+00
%arrayidx2 = getelementptr inbounds double, ptr %a, i64 %indvars.iv
store double %add, ptr %arrayidx2, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}