The `masked.load`, `masked.store`, `masked.gather` and `masked.scatter` intrinsics currently accept a separate alignment immarg. Replace this with an `align` attribute on the pointer / vector of pointers argument. This is the standard representation for alignment information on intrinsics, and is already used by all other memory intrinsics. This means the signatures now match llvm.expandload, llvm.vp.load, etc. (Things like llvm.memcpy used to have a separate alignment argument as well, but were already migrated a long time ago.) It's worth noting that the masked.gather and masked.scatter intrinsics previously accepted a zero alignment to indicate the ABI type alignment of the element type. This special case is gone now: If the align attribute is omitted, the implied alignment is 1, as usual. If ABI alignment is desired, it needs to be explicitly emitted (which the IRBuilder API already requires anyway).
1854 lines
102 KiB
LLVM
1854 lines
102 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt -passes=loop-vectorize,instcombine,simplifycfg -simplifycfg-require-and-preserve-domtree=1 -tail-predication=enabled < %s -S -o - | FileCheck %s
|
|
|
|
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
|
|
target triple = "thumbv8.1m.main-arm-none-eabi"
|
|
|
|
; Should not be vectorized
|
|
define i64 @add_i64_i64(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i64_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i64, ptr [[X:%.*]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[TMP0]], [[R_07]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i64, ptr %x, i32 %i.08
|
|
%0 = load i64, ptr %arrayidx, align 8
|
|
%add = add nsw i64 %0, %r.07
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i32_i64(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
|
|
; CHECK-NEXT: [[TMP3]] = add i64 [[VEC_PHI]], [[TMP2]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP5]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
|
|
%0 = load i32, ptr %arrayidx, align 4
|
|
%conv = sext i32 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
|
|
; CHECK-NEXT: [[TMP3]] = add i64 [[VEC_PHI]], [[TMP2]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP5]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.08
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i64
|
|
%add = add nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDLV
|
|
; FIXME: TailPredicate
|
|
define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
|
|
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
|
|
; CHECK-NEXT: [[TMP3]] = add i64 [[VEC_PHI]], [[TMP2]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i32 [[I_08]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP5]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.08
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i64
|
|
%add = add nuw nsw i64 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VADDV.u32
|
|
define i32 @add_i32_i32(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
|
|
; CHECK-NEXT: [[TMP2]] = add i32 [[VEC_PHI]], [[TMP1]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
|
|
%0 = load i32, ptr %arrayidx, align 4
|
|
%add = add nsw i32 %0, %r.07
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VADDV.u16
|
|
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i32 [[VEC_PHI]], [[TMP3]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.08
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%add = add nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u16
|
|
define i32 @add_i8_i32(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i32 [[VEC_PHI]], [[TMP3]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp6 = icmp sgt i32 %n, 0
|
|
br i1 %cmp6, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.08
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%add = add nuw nsw i32 %r.07, %conv
|
|
%inc = add nuw nsw i32 %i.08, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VADDV.u16
|
|
define signext i16 @add_i16_i16(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i16_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]])
|
|
; CHECK-NEXT: [[TMP2]] = add i16 [[VEC_PHI]], [[TMP1]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.010
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%add = add i16 %0, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u8
|
|
define signext i16 @add_i8_i16(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP1]], <16 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i16 [[VEC_PHI]], [[TMP3]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.010
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i16
|
|
%add = add i16 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VADDV.u8
|
|
define zeroext i8 @add_i8_i8(ptr nocapture readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @add_i8_i8(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]])
|
|
; CHECK-NEXT: [[TMP2]] = add i8 [[VEC_PHI]], [[TMP1]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp7 = icmp sgt i32 %n, 0
|
|
br i1 %cmp7, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.08 = phi i8 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.09
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%add = add i8 %0, %r.08
|
|
%inc = add nuw nsw i32 %i.09, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i8 %r.0.lcssa
|
|
}
|
|
|
|
; Not vectorized
|
|
define i64 @mla_i64_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i64_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i64, ptr [[X:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i64, ptr [[Y:%.*]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[R_09]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i64, ptr %x, i32 %i.010
|
|
%0 = load i64, ptr %arrayidx, align 8
|
|
%arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %i.010
|
|
%1 = load i64, ptr %arrayidx1, align 8
|
|
%mul = mul nsw i64 %1, %0
|
|
%add = add nsw i64 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VMLAL.u32
|
|
; FIXME: TailPredicate
|
|
define i64 @mla_i32_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
|
|
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i64 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[Y]], i32 [[I_010]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], [[TMP7]]
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.010
|
|
%0 = load i32, ptr %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, ptr %y, i32 %i.010
|
|
%1 = load i32, ptr %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%conv = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.09, %conv
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLAL.u16
|
|
; FIXME: TailPredicate
|
|
define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
|
|
; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i64 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[Y]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP10]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.012
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %i.012
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%conv3 = sext i32 %mul to i64
|
|
%add = add nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLAL.u16
|
|
; FIXME: TailPredicate
|
|
define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
|
|
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i64 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i32 [[I_012]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
|
|
; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP10]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = zext nneg i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.012
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, ptr %y, i32 %i.012
|
|
%1 = load i8, ptr %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i32
|
|
%mul = mul nuw nsw i32 %conv2, %conv
|
|
%conv3 = zext i32 %mul to i64
|
|
%add = add nuw nsw i64 %r.011, %conv3
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %r.0.lcssa
|
|
}
|
|
|
|
; 4x to use VMLA.i32
|
|
define i32 @mla_i32_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i32_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
|
|
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Y1:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP7]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %n, 0
|
|
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.09 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.010
|
|
%0 = load i32, ptr %arrayidx, align 4
|
|
%arrayidx1 = getelementptr inbounds i32, ptr %y, i32 %i.010
|
|
%1 = load i32, ptr %arrayidx1, align 4
|
|
%mul = mul nsw i32 %1, %0
|
|
%add = add nsw i32 %mul, %r.09
|
|
%inc = add nuw nsw i32 %i.010, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLA.i16
|
|
define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP2]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.011
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %i.011
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VMLA.i8
|
|
define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP2]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.011
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, ptr %y, i32 %i.011
|
|
%1 = load i8, ptr %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i32
|
|
%mul = mul nuw nsw i32 %conv2, %conv
|
|
%add = add nuw nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
; 8x to use VMLA.i16
|
|
define signext i16 @mla_i16_i16(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i16_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP1]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Y1:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP7]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i16 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp11 = icmp sgt i32 %n, 0
|
|
br i1 %cmp11, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.013
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %i.013
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%mul = mul i16 %1, %0
|
|
%add = add i16 %mul, %r.012
|
|
%inc = add nuw nsw i32 %i.013, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VMLA.i8
|
|
define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i16(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
|
|
; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i16 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp11 = icmp sgt i32 %n, 0
|
|
br i1 %cmp11, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.013
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i16
|
|
%arrayidx1 = getelementptr inbounds i8, ptr %y, i32 %i.013
|
|
%1 = load i8, ptr %arrayidx1, align 1
|
|
%conv2 = zext i8 %1 to i16
|
|
%mul = mul nuw i16 %conv2, %conv
|
|
%add = add i16 %mul, %r.012
|
|
%inc = add nuw nsw i32 %i.013, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i16 %r.0.lcssa
|
|
}
|
|
|
|
; 16x to use VMLA.i8
|
|
define zeroext i8 @mla_i8_i8(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i8(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Y1:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP7]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i8 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp10 = icmp sgt i32 %n, 0
|
|
br i1 %cmp10, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.011 = phi i8 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.012
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%arrayidx1 = getelementptr inbounds i8, ptr %y, i32 %i.012
|
|
%1 = load i8, ptr %arrayidx1, align 1
|
|
%mul = mul i8 %1, %0
|
|
%add = add i8 %mul, %r.011
|
|
%inc = add nuw nsw i32 %i.012, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i8 %r.0.lcssa
|
|
}
|
|
|
|
; 8x as different types
|
|
define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 {
|
|
; CHECK-LABEL: @red_mla_ext_s8_s16_s32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP2]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
|
|
; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9.not = icmp eq i32 %n, 0
|
|
br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
br label %for.body
|
|
|
|
for.body: ; preds = %for.body.preheader, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
|
%s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.011
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = sext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %B, i32 %i.011
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add nsw i32 %mul, %s.010
|
|
%inc = add nuw i32 %i.011, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
|
|
|
|
for.cond.cleanup.loopexit: ; preds = %for.body
|
|
%add.lcssa = phi i32 [ %add, %for.body ]
|
|
br label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
|
|
%s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
|
|
ret i32 %s.0.lcssa
|
|
}
|
|
|
|
; 4x as different sext vs zext
|
|
define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 {
|
|
; CHECK-LABEL: @red_mla_ext_s16_u16_s64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[B1:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP11]], align 2
|
|
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[TMP4]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i64 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B1]], i32 [[I_011]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[TMP10]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[MUL2:%.*]] = zext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_010]], [[MUL2]]
|
|
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_011]], 1
|
|
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[S_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9.not = icmp eq i32 %n, 0
|
|
br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
br label %for.body
|
|
|
|
for.body: ; preds = %for.body.preheader, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
|
%s.010 = phi i64 [ %add, %for.body ], [ 0, %for.body.preheader ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %A, i32 %i.011
|
|
%0 = load i16, ptr %arrayidx, align 1
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %B, i32 %i.011
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = zext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%mul2 = zext i32 %mul to i64
|
|
%add = add nsw i64 %mul2, %s.010
|
|
%inc = add nuw i32 %i.011, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
|
|
|
|
for.cond.cleanup.loopexit: ; preds = %for.body
|
|
%add.lcssa = phi i64 [ %add, %for.body ]
|
|
br label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
|
|
%s.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
|
|
ret i64 %s.0.lcssa
|
|
}
|
|
|
|
; 4x as different sext vs zext
|
|
define i32 @red_mla_u8_s8_u32(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, i32 %n) #0 {
|
|
; CHECK-LABEL: @red_mla_u8_s8_u32(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
|
|
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B1:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP9]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
|
|
; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD2]] to <4 x i32>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9.not = icmp eq i32 %n, 0
|
|
br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
|
|
|
|
for.body.preheader: ; preds = %entry
|
|
br label %for.body
|
|
|
|
for.body: ; preds = %for.body.preheader, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
|
%s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.011
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.011
|
|
%1 = load i8, ptr %arrayidx1, align 1
|
|
%conv2 = sext i8 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add i32 %mul, %s.010
|
|
%inc = add nuw i32 %i.011, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
|
|
|
|
for.cond.cleanup.loopexit: ; preds = %for.body
|
|
%add.lcssa = phi i32 [ %add, %for.body ]
|
|
br label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
|
|
%s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
|
|
ret i32 %s.0.lcssa
|
|
}
|
|
|
|
; Make sure interleave group members feeding in-loop reductions can be handled.
|
|
define i32 @reduction_interleave_group(i32 %n, ptr %arr) #0 {
|
|
; CHECK-LABEL: @reduction_interleave_group(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[GUARD:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[GUARD]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
|
|
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
|
|
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[DOTIDX:%.*]] = shl i32 [[INDEX]], 3
|
|
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARR:%.*]], i32 [[DOTIDX]]
|
|
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
|
|
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
|
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[VEC_PHI]], [[TMP6]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
|
|
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP7]], [[TMP8]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[RED_PHI:%.*]] = phi i32 [ [[RED_2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IV]]
|
|
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr [[TMP11]], i32 4
|
|
; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[GEP_0]], align 4
|
|
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i32 [[IV]]
|
|
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_1]], align 4
|
|
; CHECK-NEXT: [[RED_1:%.*]] = add i32 [[L_0]], [[RED_PHI]]
|
|
; CHECK-NEXT: [[RED_2]] = add i32 [[RED_1]], [[L_1]]
|
|
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 2
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]]
|
|
; CHECK: exit:
|
|
; CHECK-NEXT: [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i32 [[RET_LCSSA]]
|
|
;
|
|
entry:
|
|
%guard = icmp sgt i32 %n, 0
|
|
br i1 %guard , label %for.body, label %exit
|
|
|
|
for.body: ; preds = %for.body.preheader, %for.body
|
|
%iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
|
|
%red.phi = phi i32 [ %red.2, %for.body ], [ 0, %entry ]
|
|
%add = or disjoint i32 %iv, 1
|
|
%gep.0 = getelementptr inbounds i32, ptr %arr, i32 %add
|
|
%l.0 = load i32, ptr %gep.0, align 4
|
|
%gep.1 = getelementptr inbounds i32, ptr %arr, i32 %iv
|
|
%l.1 = load i32, ptr %gep.1, align 4
|
|
%red.1 = add i32 %l.0, %red.phi
|
|
%red.2 = add i32 %red.1, %l.1
|
|
%iv.next = add nuw nsw i32 %iv, 2
|
|
%cmp = icmp slt i32 %iv.next, %n
|
|
br i1 %cmp, label %for.body, label %exit
|
|
|
|
exit:
|
|
%ret.lcssa = phi i32 [ 0, %entry ], [ %red.2, %for.body ]
|
|
ret i32 %ret.lcssa
|
|
}
|
|
|
|
; 16x to use VMLA.i8, same as mla_i8_i32 with multiple uses of the ext `add(mul(x, x))`
|
|
define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_i8_i32_multiuse(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw i32 [[N]], 15
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
|
|
; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i8, ptr %x, i32 %i.011
|
|
%0 = load i8, ptr %arrayidx, align 1
|
|
%conv = zext i8 %0 to i32
|
|
%mul = mul nuw nsw i32 %conv, %conv
|
|
%add = add nuw nsw i32 %mul, %r.010
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond = icmp eq i32 %inc, %n
|
|
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i32 %r.0.lcssa
|
|
}
|
|
|
|
define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @mla_xx_sext_zext(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
|
|
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP5]] = add i64 [[VEC_PHI]], [[TMP4]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[S_0_LCSSA]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i32 [[I_011]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP7]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = zext nneg i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[S_010]], [[CONV3]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1
|
|
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
|
|
;
|
|
entry:
|
|
%cmp9 = icmp sgt i32 %n, 0
|
|
br i1 %cmp9, label %for.body, label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
|
|
ret i64 %s.0.lcssa
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%s.010 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.011
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%mul = mul nsw i32 %conv, %conv
|
|
%conv3 = zext nneg i32 %mul to i64
|
|
%add = add nuw nsw i64 %s.010, %conv3
|
|
%inc = add nuw nsw i32 %i.011, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 noundef %n) #0 {
|
|
; CHECK-LABEL: @mla_and_add_together_16_64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP16]])
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 8
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
|
|
; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP4]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP5]] = add i64 [[VEC_PHI]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
|
|
; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI1]], [[TMP6]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: [[ADD6_LCSSA:%.*]] = phi i32 [ [[ADD6:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: [[CONV7:%.*]] = sext i32 [[ADD6_LCSSA]] to i64
|
|
; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[CONV7]], [[ADD_LCSSA]]
|
|
; CHECK-NEXT: ret i64 [[DIV]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_019:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[T_018:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[S_017:%.*]] = phi i32 [ [[ADD6]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i32 [[I_019]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = zext nneg i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[T_018]], [[CONV3]]
|
|
; CHECK-NEXT: [[ADD6]] = add nsw i32 [[S_017]], [[CONV]]
|
|
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_019]], 1
|
|
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
|
|
;
|
|
entry:
|
|
%cmp16 = icmp sgt i32 %n, 0
|
|
tail call void @llvm.assume(i1 %cmp16)
|
|
br label %for.body
|
|
|
|
for.cond.cleanup:
|
|
%conv7 = sext i32 %add6 to i64
|
|
%div = sdiv i64 %conv7, %add
|
|
ret i64 %div
|
|
|
|
for.body:
|
|
%i.019 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%t.018 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%s.017 = phi i32 [ %add6, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.019
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%mul = mul nsw i32 %conv, %conv
|
|
%conv3 = zext nneg i32 %mul to i64
|
|
%add = add nuw nsw i64 %t.018, %conv3
|
|
%add6 = add nsw i32 %s.017, %conv
|
|
%inc = add nuw nsw i32 %i.019, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
define i64 @interleave_doublereduct_i16_i64(ptr %x, ptr %y, i32 %n) {
|
|
; CHECK-LABEL: @interleave_doublereduct_i16_i64(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP23:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP23]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[T_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD12:%.*]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: ret i64 [[T_0_LCSSA]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD13:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[T_024:%.*]] = phi i64 [ [[ADD12]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X:%.*]], i32 [[I_025]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[Y:%.*]], i32 [[I_025]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[T_024]], [[CONV3]]
|
|
; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_025]], 1
|
|
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i32 [[ADD4]]
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX5]], align 2
|
|
; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP2]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i16, ptr [[Y]], i32 [[ADD4]]
|
|
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2
|
|
; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32
|
|
; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[CONV9]], [[CONV6]]
|
|
; CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[MUL10]] to i64
|
|
; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]]
|
|
; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
|
|
;
|
|
entry:
|
|
%cmp23 = icmp sgt i32 %n, 0
|
|
br i1 %cmp23, label %for.body, label %for.cond.cleanup
|
|
|
|
for.cond.cleanup:
|
|
%t.0.lcssa = phi i64 [ 0, %entry ], [ %add12, %for.body ]
|
|
ret i64 %t.0.lcssa
|
|
|
|
for.body:
|
|
%i.025 = phi i32 [ %add13, %for.body ], [ 0, %entry ]
|
|
%t.024 = phi i64 [ %add12, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.025
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %i.025
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%conv3 = sext i32 %mul to i64
|
|
%add = add nsw i64 %t.024, %conv3
|
|
%add4 = or disjoint i32 %i.025, 1
|
|
%arrayidx5 = getelementptr inbounds i16, ptr %x, i32 %add4
|
|
%2 = load i16, ptr %arrayidx5, align 2
|
|
%conv6 = sext i16 %2 to i32
|
|
%arrayidx8 = getelementptr inbounds i16, ptr %y, i32 %add4
|
|
%3 = load i16, ptr %arrayidx8, align 2
|
|
%conv9 = sext i16 %3 to i32
|
|
%mul10 = mul nsw i32 %conv9, %conv6
|
|
%conv11 = sext i32 %mul10 to i64
|
|
%add12 = add nsw i64 %add, %conv11
|
|
%add13 = add nuw nsw i32 %i.025, 2
|
|
%cmp = icmp slt i32 %add13, %n
|
|
br i1 %cmp, label %for.body, label %for.cond.cleanup
|
|
}
|
|
|
|
define i64 @test_std_q31(ptr %x, i32 %n) #0 {
|
|
; CHECK-LABEL: @test_std_q31(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]])
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
|
|
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ]
|
|
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
|
|
; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8)
|
|
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
|
|
; CHECK-NEXT: [[TMP4]] = add i64 [[VEC_PHI]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP2]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
|
|
; CHECK-NEXT: [[TMP8]] = add i64 [[VEC_PHI1]], [[TMP7]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]]
|
|
; CHECK-NEXT: ret i64 [[DIV]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64
|
|
; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]]
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]]
|
|
; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]]
|
|
; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1
|
|
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
|
|
;
|
|
entry:
|
|
%cmp11 = icmp sgt i32 %n, 0
|
|
tail call void @llvm.assume(i1 %cmp11)
|
|
br label %for.body
|
|
|
|
for.cond.cleanup: ; preds = %for.body
|
|
%div = sdiv i64 %add3, %add
|
|
ret i64 %div
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%s.014 = phi i64 [ %add, %for.body ], [ 0, %entry ]
|
|
%i.013 = phi i32 [ %add4, %for.body ], [ 0, %entry ]
|
|
%t.012 = phi i64 [ %add3, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.013
|
|
%0 = load i32, ptr %arrayidx, align 4
|
|
%shr = ashr i32 %0, 8
|
|
%conv = sext i32 %shr to i64
|
|
%add = add nsw i64 %s.014, %conv
|
|
%mul = mul nsw i64 %conv, %conv
|
|
%add3 = add nuw nsw i64 %mul, %t.012
|
|
%add4 = add nuw nsw i32 %i.013, 1
|
|
%exitcond.not = icmp eq i32 %add4, %n
|
|
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
|
|
}
|
|
|
|
define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 {
|
|
; CHECK-LABEL: @test_fir_q15(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CMP23:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
|
; CHECK-NEXT: br i1 [[CMP23]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
|
; CHECK: for.body.preheader:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
|
|
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8
|
|
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[OFFSET_IDX]]
|
|
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
|
|
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]]
|
|
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
|
|
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
|
; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
|
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP7]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
|
|
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[VEC_PHI]], [[TMP9]]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64>
|
|
; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <8 x i64> [[TMP11]], [[TMP12]]
|
|
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP13]])
|
|
; CHECK-NEXT: [[TMP16]] = add i64 [[TMP10]], [[TMP15]]
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
|
|
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.cond.cleanup:
|
|
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD12:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
|
|
; CHECK-NEXT: ret i64 [[S_0_LCSSA]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD13:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[S_024:%.*]] = phi i64 [ [[ADD12]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_025]]
|
|
; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[Y]], i32 [[I_025]]
|
|
; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP19]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
|
|
; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[S_024]], [[CONV3]]
|
|
; CHECK-NEXT: [[ADD4:%.*]] = or disjoint i32 [[I_025]], 1
|
|
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[ADD4]]
|
|
; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX5]], align 2
|
|
; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP20]] to i32
|
|
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[Y]], i32 [[ADD4]]
|
|
; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2
|
|
; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP21]] to i32
|
|
; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[CONV9]], [[CONV6]]
|
|
; CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[MUL10]] to i64
|
|
; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]]
|
|
; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]]
|
|
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]]
|
|
;
|
|
entry:
|
|
%cmp23 = icmp sgt i32 %n, 0
|
|
br i1 %cmp23, label %for.body, label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %for.body, %entry
|
|
%s.0.lcssa = phi i64 [ 0, %entry ], [ %add12, %for.body ]
|
|
ret i64 %s.0.lcssa
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.025 = phi i32 [ %add13, %for.body ], [ 0, %entry ]
|
|
%s.024 = phi i64 [ %add12, %for.body ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds i16, ptr %x, i32 %i.025
|
|
%0 = load i16, ptr %arrayidx, align 2
|
|
%conv = sext i16 %0 to i32
|
|
%arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %i.025
|
|
%1 = load i16, ptr %arrayidx1, align 2
|
|
%conv2 = sext i16 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%conv3 = sext i32 %mul to i64
|
|
%add = add nsw i64 %s.024, %conv3
|
|
%add4 = or disjoint i32 %i.025, 1
|
|
%arrayidx5 = getelementptr inbounds i16, ptr %x, i32 %add4
|
|
%2 = load i16, ptr %arrayidx5, align 2
|
|
%conv6 = sext i16 %2 to i32
|
|
%arrayidx8 = getelementptr inbounds i16, ptr %y, i32 %add4
|
|
%3 = load i16, ptr %arrayidx8, align 2
|
|
%conv9 = sext i16 %3 to i32
|
|
%mul10 = mul nsw i32 %conv9, %conv6
|
|
%conv11 = sext i32 %mul10 to i64
|
|
%add12 = add nsw i64 %add, %conv11
|
|
%add13 = add nuw nsw i32 %i.025, 2
|
|
%cmp = icmp slt i32 %add13, %n
|
|
br i1 %cmp, label %for.body, label %for.cond.cleanup
|
|
}
|
|
|
|
|
|
attributes #0 = { "target-features"="+mve" }
|