In the isOutsideLoopWorkProfitable function, there are two places where only the runtime check cost (RtC) should be used, but where the costs of the middle block and early-exit blocks are incorrectly included as well: (1) the VectorizeMemoryCheckThreshold comparison for the interleaving-only case, and (2) the minimum trip count that bounds the runtime check overhead, i.e. the MinTC2 calculation. This results in an overly conservative minimum profitable trip count. This patch separates the runtime check cost from the total overhead cost, and uses only RtC for the VectorizeMemoryCheckThreshold comparison and the MinTC2 calculation.
872 lines
51 KiB
LLVM
872 lines
51 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=X86
; RUN: opt -passes=loop-vectorize -mattr=+avx512f -S < %s 2>&1 | FileCheck %s --check-prefix=AVX512

; Loop-vectorizer tests on x86-64, run once with default features and once
; with +avx512f; each run is verified against its own FileCheck prefix.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
|
|
|
|
; Conditional scalar assignment over i32 data: each iteration loads data[iv]
; and, when %a is signed-less-than the loaded value, the loaded value replaces
; the running result (initially -1). The final running result is returned.
;
; With the X86 prefix the checks expect a <4 x i32> vector loop (step 4) that
; is bypassed when N is below 8; with the AVX512 prefix they expect a
; <16 x i32> vector loop (step 16) that is bypassed when N is below 32. In
; both cases the middle block extracts the result with
; llvm.experimental.vector.extract.last.active and a scalar remainder loop
; is kept.
define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
; X86-LABEL: define i32 @simple_csa_int_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; X86: [[VECTOR_PH]]:
; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
; X86-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; X86-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
; X86-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
; X86-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
; X86-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; X86-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; X86-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; X86: [[MIDDLE_BLOCK]]:
; X86-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1)
; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; X86: [[SCALAR_PH]]:
; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; X86-NEXT: br label %[[LOOP1:.*]]
; X86: [[LOOP1]]:
; X86-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[LD_ADDR1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV1]]
; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR1]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i32 @simple_csa_int_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; AVX512: [[VECTOR_PH]]:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: br label %[[VECTOR_BODY:.*]]
; AVX512: [[VECTOR_BODY]]:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4
; AVX512-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; AVX512-NEXT: [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]]
; AVX512-NEXT: [[TMP6]] = select i1 [[TMP4]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: [[MIDDLE_BLOCK]]:
; AVX512-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP6]], <16 x i1> [[TMP5]], i32 -1)
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; AVX512: [[SCALAR_PH]]:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Running result carried across iterations; starts at -1.
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
  %ld = load i32, ptr %ld.addr, align 4
  %select.cmp = icmp slt i32 %a, %ld
  ; Conditionally overwrite the running result with the value just loaded.
  %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret i32 %select.data
}
|
|
|
|
; Conditional scalar assignment over pointer data: each iteration loads a
; pointer from data[iv], converts it to i64, and when %a is signed-less-than
; that integer the loaded pointer replaces the running result (initially
; %init). The final running result is returned.
;
; For both check prefixes the expected output is the original scalar loop,
; with only an LCSSA phi added in the exit block (no vector loop).
define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) {
; X86-LABEL: define ptr @simple_csa_ptr_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: ret ptr [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define ptr @simple_csa_ptr_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: ret ptr [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Running result carried across iterations; starts at %init.
  %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv
  %ld = load ptr, ptr %ld.addr, align 4
  ; The comparison is done on the integer value of the loaded pointer.
  %ld.i64 = ptrtoint ptr %ld to i64
  %select.cmp = icmp slt i64 %a, %ld.i64
  %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret ptr %select.data
}
|
|
|
|
; Conditional scalar assignment over float data: each iteration loads
; data[iv] and, when %a is ordered-less-than the loaded value, the loaded
; value replaces the running result (initially -1.0). The final running
; result is returned.
;
; For both check prefixes the expected output is the original scalar loop,
; with only an LCSSA phi added in the exit block (no vector loop).
define float @simple_csa_float_select(i64 %N, ptr %data, float %a) {
; X86-LABEL: define float @simple_csa_float_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: ret float [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define float @simple_csa_float_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: ret float [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Running result carried across iterations; starts at -1.0.
  %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv
  %ld = load float, ptr %ld.addr, align 4
  %select.cmp = fcmp olt float %a, %ld
  %select.data = select i1 %select.cmp, float %ld, float %data.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret float %select.data
}
|
|
|
|
; Conditional scalar assignment whose selected value has a second user inside
; the loop: besides feeding the recurrence phi, %select.data is stored to
; results[iv] on every iteration. The final running result is returned.
;
; For both check prefixes the expected output is the original scalar loop,
; with only an LCSSA phi added in the exit block (no vector loop).
define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) {
; X86-LABEL: define i32 @multi_user_csa_int_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; X86-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
; X86-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i32 @multi_user_csa_int_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
; AVX512-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
  %ld = load i32, ptr %ld.addr, align 4
  %select.cmp = icmp slt i32 %a, %ld
  %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
  ; Second in-loop user of the selected value: it is written out every iteration.
  %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv
  store i32 %select.data, ptr %res.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret i32 %select.data
}
|
|
|
|
|
|
; Two conditional scalar assignments driven by the same compare: when %a is
; signed-less-than data[iv], both the loaded value and the index %iv replace
; their running results (each initially -1). After the loop the index result
; is truncated to i32 and added to the data result; that sum is returned.
;
; For both check prefixes the expected output is the original scalar loop,
; with only LCSSA phis added in the exit block (no vector loop).
define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) {
; X86-LABEL: define i32 @multi_use_cmp_for_csa_int_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; X86-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ]
; X86-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32
; X86-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]]
; X86-NEXT: ret i32 [[RES]]
;
; AVX512-LABEL: define i32 @multi_use_cmp_for_csa_int_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ]
; AVX512-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32
; AVX512-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]]
; AVX512-NEXT: ret i32 [[RES]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ]
  %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
  %ld = load i32, ptr %ld.addr, align 4
  ; One compare controls both selects below.
  %select.cmp = icmp slt i32 %a, %ld
  %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
  %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  %idx = trunc i64 %select.idx to i32
  %res = add i32 %idx, %select.data
  ret i32 %res
}
|
|
|
|
|
|
; Conditional scalar assignment built from two chained selects: the first
; picks data1[iv] over the running result (initially -1) when %a is
; signed-less-than data1[iv]; the second picks data2[iv] over the first
; select's result when %b is signed-greater-than data2[iv]. The second
; select feeds the recurrence phi and is returned after the loop.
;
; For both check prefixes the expected output is the original scalar loop,
; with only an LCSSA phi added in the exit block (no vector loop).
define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) {
; X86-LABEL: define i32 @chained_select_for_csa_int_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; X86-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
; X86-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
; X86-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]]
; X86-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i32 @chained_select_for_csa_int_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; AVX512-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]]
; AVX512-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]]
; AVX512-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  ; First link of the chain: candidate from %data1 versus the running result.
  %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv
  %ld1 = load i32, ptr %ld1.addr, align 4
  %select.cmp1 = icmp slt i32 %a, %ld1
  %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi
  ; Second link: candidate from %data2 versus the first select's result.
  %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv
  %ld2 = load i32, ptr %ld2.addr, align 4
  %select.cmp2 = icmp sgt i32 %b, %ld2
  %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret i32 %select.data
}
|
|
|
|
; Conditional scalar assignment whose selected value is additionally stored
; to out[iv] on every iteration. Here %data is marked readonly and %out is
; marked noalias. The final running result (initially -1) is returned.
;
; For both check prefixes the expected output is the original scalar loop,
; with only an LCSSA phi added in the exit block (no vector loop).
define i32 @csa_with_extra_use_of_select(i64 %N, ptr readonly %data, ptr noalias %out, i32 %a) {
; X86-LABEL: define i32 @csa_with_extra_use_of_select(
; X86-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; X86-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]]
; X86-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i32 @csa_with_extra_use_of_select(
; AVX512-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]]
; AVX512-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ]
; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
  %ld = load i32, ptr %ld.addr, align 4
  %select.cmp = icmp slt i32 %a, %ld
  %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
  ; Extra in-loop use of the selected value: it is written to %out each iteration.
  %st.addr = getelementptr inbounds i32, ptr %out, i64 %iv
  store i32 %select.data, ptr %st.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret i32 %select.data
}
|
|
|
|
;; Add more work to the loop besides the CSA to check cost modelling.
|
|
define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %threshold) {
|
|
; X86-LABEL: define i32 @int_select_with_extra_arith_payload(
|
|
; X86-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) {
|
|
; X86-NEXT: [[ENTRY:.*]]:
|
|
; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
|
|
; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
|
|
; X86: [[VECTOR_PH]]:
|
|
; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
|
|
; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
|
|
; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[THRESHOLD]], i64 0
|
|
; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
|
|
; X86-NEXT: br label %[[LOOP:.*]]
|
|
; X86: [[LOOP]]:
|
|
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
|
|
; X86-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[LOOP]] ]
|
|
; X86-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[LOOP]] ]
|
|
; X86-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
|
|
; X86-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 4
|
|
; X86-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[WIDE_LOAD]], splat (i32 13)
|
|
; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
|
|
; X86-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
|
|
; X86-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_LOAD1]], splat (i32 5)
|
|
; X86-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
|
|
; X86-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
|
|
; X86-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
|
|
; X86-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
|
|
; X86-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
|
|
; X86-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
|
|
; X86-NEXT: [[TMP10]] = select i1 [[TMP9]], <4 x i1> [[TMP7]], <4 x i1> [[TMP0]]
|
|
; X86-NEXT: [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
|
|
; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
|
|
; X86-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
|
|
; X86-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
|
|
; X86: [[MIDDLE_BLOCK]]:
|
|
; X86-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1)
|
|
; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
|
|
; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
|
|
; X86: [[SCALAR_PH]]:
|
|
; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
|
|
; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
|
|
; X86-NEXT: br label %[[LOOP1:.*]]
|
|
; X86: [[LOOP1]]:
|
|
; X86-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
|
|
; X86-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP1]] ]
|
|
; X86-NEXT: [[A_ADDR1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
|
|
; X86-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR1]], align 4
|
|
; X86-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13
|
|
; X86-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
|
|
; X86-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
|
|
; X86-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5
|
|
; X86-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
|
|
; X86-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV1]]
|
|
; X86-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
|
|
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]]
|
|
; X86-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
|
|
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
|
|
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
|
|
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]]
|
|
; X86: [[EXIT]]:
|
|
; X86-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP1]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
|
|
; X86-NEXT: ret i32 [[SELECT_A_LCSSA]]
|
|
;
|
|
; AVX512-LABEL: define i32 @int_select_with_extra_arith_payload(
|
|
; AVX512-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
|
|
; AVX512-NEXT: [[ENTRY:.*]]:
|
|
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
|
|
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
|
|
; AVX512: [[VECTOR_PH]]:
|
|
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
|
|
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
|
|
; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[THRESHOLD]], i64 0
|
|
; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
|
|
; AVX512-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; AVX512: [[VECTOR_BODY]]:
|
|
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
|
|
; AVX512-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
|
|
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
|
|
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4
|
|
; AVX512-NEXT: [[TMP2:%.*]] = mul <16 x i32> [[WIDE_LOAD]], splat (i32 13)
|
|
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
|
|
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
|
|
; AVX512-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[WIDE_LOAD1]], splat (i32 5)
|
|
; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP4]]
|
|
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
|
|
; AVX512-NEXT: store <16 x i32> [[TMP5]], ptr [[TMP6]], align 4
|
|
; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <16 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
|
|
; AVX512-NEXT: [[TMP8:%.*]] = freeze <16 x i1> [[TMP7]]
|
|
; AVX512-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP8]])
|
|
; AVX512-NEXT: [[TMP10]] = select i1 [[TMP9]], <16 x i1> [[TMP7]], <16 x i1> [[TMP0]]
|
|
; AVX512-NEXT: [[TMP11]] = select i1 [[TMP9]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]]
|
|
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
|
|
; AVX512-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
|
|
; AVX512-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
|
; AVX512: [[MIDDLE_BLOCK]]:
|
|
; AVX512-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP11]], <16 x i1> [[TMP10]], i32 -1)
|
|
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
|
|
; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
|
|
; AVX512: [[SCALAR_PH]]:
|
|
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
|
|
; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
|
|
; AVX512-NEXT: br label %[[LOOP:.*]]
|
|
; AVX512: [[LOOP]]:
|
|
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
|
|
; AVX512-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ]
|
|
; AVX512-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
|
|
; AVX512-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
|
|
; AVX512-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13
|
|
; AVX512-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
|
|
; AVX512-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
|
|
; AVX512-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5
|
|
; AVX512-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]]
|
|
; AVX512-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
|
|
; AVX512-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4
|
|
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]]
|
|
; AVX512-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]]
|
|
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
|
|
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
|
|
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
|
|
; AVX512: [[EXIT]]:
|
|
; AVX512-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
|
|
; AVX512-NEXT: ret i32 [[SELECT_A_LCSSA]]
|
|
;
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
|
|
%A.phi = phi i32 [ -1, %entry ], [ %select.A, %loop ]
|
|
%A.addr = getelementptr inbounds i32, ptr %A, i64 %iv
|
|
%ld.A = load i32, ptr %A.addr, align 4
|
|
%mul.A = mul i32 %ld.A, 13
|
|
%B.addr = getelementptr inbounds i32, ptr %B, i64 %iv
|
|
%ld.B = load i32, ptr %B.addr, align 4
|
|
%mul.B = mul i32 %ld.B, 5
|
|
%add = add i32 %mul.A, %mul.B
|
|
%C.addr = getelementptr inbounds i32, ptr %C, i64 %iv
|
|
store i32 %add, ptr %C.addr, align 4
|
|
%select.cmp = icmp slt i32 %threshold, %ld.A
|
|
%select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi
|
|
%iv.next = add nuw nsw i64 %iv, 1
|
|
%exit.cmp = icmp eq i64 %iv.next, %N
|
|
br i1 %exit.cmp, label %exit, label %loop
|
|
|
|
exit:
|
|
ret i32 %select.A
|
|
}
|
|
|
|
; Conditional-scalar-assignment over i8 data.
;
; Scalar source (see the IR body after the check lines): each iteration loads
; the i8 at data[iv] and, when %a is signed-less-than that element, replaces the
; running value (initially -1) with it. The loop leaves once iv + 1 == N and the
; running value is returned.
;
; Recorded expectations:
;   * baseline run: the loop is widened to <16 x i8>, stepping the index by 16.
;   * +avx512f run: the loop is widened to <64 x i8>, stepping the index by 64;
;     the entry guard compares N against 128 rather than 64 (presumably a
;     profitability-derived minimum trip count -- confirm against the cost
;     model before changing it).
;   * both runs carry the candidate values and an i1 mask in vector phis and
;     collapse them in the middle block with
;     llvm.experimental.vector.extract.last.active, using -1 as the fallback.
;
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_test_checks.py instead of editing by hand.
define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) {
; X86-LABEL: define i8 @simple_csa_byte_select(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; X86: [[VECTOR_PH]]:
; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ splat (i8 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
; X86-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[TMP2:%.*]] = icmp slt <16 x i8> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; X86-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
; X86-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; X86-NEXT: [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]]
; X86-NEXT: [[TMP6]] = select i1 [[TMP4]], <16 x i8> [[WIDE_LOAD]], <16 x i8> [[VEC_PHI]]
; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 16
; X86-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; X86-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; X86: [[MIDDLE_BLOCK]]:
; X86-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> [[TMP6]], <16 x i1> [[TMP5]], i8 -1)
; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; X86: [[SCALAR_PH]]:
; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; X86-NEXT: br label %[[LOOP1:.*]]
; X86: [[LOOP1]]:
; X86-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[LD_ADDR1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV1]]
; X86-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR1]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; X86-NEXT: ret i8 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i8 @simple_csa_byte_select(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 128
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; AVX512: [[VECTOR_PH]]:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i8> poison, i8 [[A]], i64 0
; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i8> [[BROADCAST_SPLATINSERT]], <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512-NEXT: br label %[[VECTOR_BODY:.*]]
; AVX512: [[VECTOR_BODY]]:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ splat (i8 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = phi <64 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[INDEX]]
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr [[TMP1]], align 4
; AVX512-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP3:%.*]] = freeze <64 x i1> [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> [[TMP3]])
; AVX512-NEXT: [[TMP5]] = select i1 [[TMP4]], <64 x i1> [[TMP2]], <64 x i1> [[TMP0]]
; AVX512-NEXT: [[TMP6]] = select i1 [[TMP4]], <64 x i8> [[WIDE_LOAD]], <64 x i8> [[VEC_PHI]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: [[MIDDLE_BLOCK]]:
; AVX512-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v64i8(<64 x i8> [[TMP6]], <64 x i1> [[TMP5]], i8 -1)
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; AVX512: [[SCALAR_PH]]:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; AVX512-NEXT: ret i8 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i8 [ -1, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds i8, ptr %data, i64 %iv
  %ld = load i8, ptr %ld.addr, align 4
  %select.cmp = icmp slt i8 %a, %ld
  %select.data = select i1 %select.cmp, i8 %ld, i8 %data.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop

exit:
  ret i8 %select.data
}

; Conditional-scalar-assignment over i32 data with explicit loop hints.
;
; Scalar source (see the IR body after the check lines): each iteration loads
; the i32 at data[iv] and, when %a is signed-less-than that element, replaces
; the running value (initially -1) with it. The loop leaves once iv + 1 == N and
; the running value is returned. The back-edge branch carries !llvm.loop !1,
; whose nodes are defined at the end of this file.
;
; Recorded expectations:
;   * baseline run: <4 x i32> vector body, index stepping by 4, entry guard
;     against 4.
;   * +avx512f run: <16 x i32> vector body, index stepping by 16, entry guard
;     against 16.
;   * in both runs the vector body contains a single wide load and the index
;     step equals the vector width.
;   * the middle block extracts the result with
;     llvm.experimental.vector.extract.last.active, using -1 as the fallback.
;
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_test_checks.py instead of editing by hand.
define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; X86-LABEL: define i32 @simple_csa_int_select_use_interleave(
; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*]]:
; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; X86: [[VECTOR_PH]]:
; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; X86-NEXT: br label %[[LOOP:.*]]
; X86: [[LOOP]]:
; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
; X86-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[LOOP]] ]
; X86-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; X86-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LD_ADDR]], align 4
; X86-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; X86-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
; X86-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
; X86-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
; X86-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; X86-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; X86-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; X86: [[MIDDLE_BLOCK]]:
; X86-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1)
; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; X86: [[SCALAR_PH]]:
; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; X86-NEXT: br label %[[LOOP1:.*]]
; X86: [[LOOP1]]:
; X86-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP1]] ]
; X86-NEXT: [[LD_ADDR1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV1]]
; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR1]], align 4
; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]]
; X86: [[EXIT]]:
; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP1]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; AVX512-LABEL: define i32 @simple_csa_int_select_use_interleave(
; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
; AVX512-NEXT: [[ENTRY:.*]]:
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; AVX512: [[VECTOR_PH]]:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0
; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: br label %[[VECTOR_BODY:.*]]
; AVX512: [[VECTOR_BODY]]:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4
; AVX512-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; AVX512-NEXT: [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]]
; AVX512-NEXT: [[TMP6]] = select i1 [[TMP4]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512: [[MIDDLE_BLOCK]]:
; AVX512-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP6]], <16 x i1> [[TMP5]], i32 -1)
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; AVX512: [[SCALAR_PH]]:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; AVX512-NEXT: br label %[[LOOP:.*]]
; AVX512: [[LOOP]]:
; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ]
; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]]
; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]]
; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512: [[EXIT]]:
; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ]
  %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv
  %ld = load i32, ptr %ld.addr, align 4
  %select.cmp = icmp slt i32 %a, %ld
  %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exit.cmp = icmp eq i64 %iv.next, %N
  br i1 %exit.cmp, label %exit, label %loop, !llvm.loop !1

exit:
  ret i32 %select.data
}

; Loop hint metadata referenced as `!llvm.loop !1` from the back-edge branch of
; @simple_csa_int_select_use_interleave: !1 is the self-referential loop ID,
; !2 requests an interleave count of 2, and !3 sets vectorize.enable to true.
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.interleave.count", i32 2}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}