In the initial patch for FMAD, potential FMAD nodes were completely excluded from the reduction analysis to keep the patch smaller. However, this may cause regressions. This patch adds better detection of scalar FMAD reduction operations and tries to correctly calculate the costs of the FMAD reduction operations (excluding the costs of the scalar fmuls) and of split reduction operations combined with regular FMADs. It also fixes the handling of reduced values with many uses. Reviewers: RKSimon, gregbedwell, hiraditya. Reviewed By: RKSimon. Pull Request: https://github.com/llvm/llvm-project/pull/152787
808 lines
38 KiB
LLVM
808 lines
38 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
|
|
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
|
|
|
|
; Three adjacent i32 elements are loaded from %src, each is multiplied
; (mul nsw) by the constant 10, and the results are stored to three adjacent
; i32 slots of %dst.
; Expected with the NON-POW2 prefix -- one <3 x i32> load/mul/store chain.
; Expected with the POW2-ONLY prefix -- elements 0-1 become a <2 x i32>
; chain while element 2 stays scalar.
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
|
|
; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], splat (i32 10)
|
|
; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
|
|
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], splat (i32 10)
|
|
; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
|
|
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
|
|
%l.src.0 = load i32, ptr %gep.src.0, align 4
|
|
%mul.0 = mul nsw i32 %l.src.0, 10
|
|
|
|
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
|
|
%l.src.1 = load i32, ptr %gep.src.1, align 4
|
|
%mul.1 = mul nsw i32 %l.src.1, 10
|
|
|
|
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
|
|
%l.src.2 = load i32, ptr %gep.src.2, align 4
|
|
%mul.2 = mul nsw i32 %l.src.2, 10
|
|
|
|
store i32 %mul.0, ptr %dst
|
|
|
|
%dst.1 = getelementptr i32, ptr %dst, i32 1
|
|
store i32 %mul.1, ptr %dst.1
|
|
|
|
%dst.2 = getelementptr i32, ptr %dst, i32 2
|
|
store i32 %mul.2, ptr %dst.2
|
|
|
|
ret void
|
|
}
|
|
|
|
; Should not be vectorized with an undef/poison element as padding, as
|
|
; division by undef/poison may cause UB. Must use VL predication or
|
|
; masking instead, where RISCV wins.
|
|
; Three adjacent i32 elements are loaded from %src and used as the divisor
; of `udiv i32 10, x`; the quotients are stored to three adjacent i32 slots
; of %dst. The scalar results are named %mul.N although they are udiv results.
; Expected with the NON-POW2 prefix -- one <3 x i32> load/udiv/store chain.
; Expected with the POW2-ONLY prefix -- the code stays fully scalar.
define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
|
|
; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = udiv <3 x i32> splat (i32 10), [[TMP0]]
|
|
; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
|
|
; POW2-ONLY-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
|
|
; POW2-ONLY-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
|
|
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
|
|
%l.src.0 = load i32, ptr %gep.src.0, align 4
|
|
%mul.0 = udiv i32 10, %l.src.0
|
|
|
|
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
|
|
%l.src.1 = load i32, ptr %gep.src.1, align 4
|
|
%mul.1 = udiv i32 10, %l.src.1
|
|
|
|
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
|
|
%l.src.2 = load i32, ptr %gep.src.2, align 4
|
|
%mul.2 = udiv i32 10, %l.src.2
|
|
|
|
store i32 %mul.0, ptr %dst
|
|
|
|
%dst.1 = getelementptr i32, ptr %dst, i32 1
|
|
store i32 %mul.1, ptr %dst.1
|
|
|
|
%dst.2 = getelementptr i32, ptr %dst, i32 2
|
|
store i32 %mul.2, ptr %dst.2
|
|
|
|
ret void
|
|
}
|
|
|
|
|
|
|
|
; Three adjacent i32 elements are loaded from each of %src.1 and %src.2,
; multiplied pairwise (mul nsw), and stored to three adjacent i32 slots of
; %dst.
; Expected with the NON-POW2 prefix -- two <3 x i32> loads, one <3 x i32>
; mul and one <3 x i32> store.
; Expected with the POW2-ONLY prefix -- elements 0-1 become a <2 x i32>
; chain while element 2 stays scalar.
define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
|
|
; NON-POW2-LABEL: @v3_load_i32_mul_store(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
|
|
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
|
|
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
|
|
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
|
|
; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
|
|
%l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
|
|
%gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
|
|
%l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
|
|
%mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
|
|
|
|
%gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
|
|
%l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
|
|
%gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
|
|
%l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
|
|
%mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
|
|
|
|
%gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
|
|
%l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
|
|
%gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
|
|
%l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
|
|
%mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
|
|
|
|
store i32 %mul.0, ptr %dst
|
|
|
|
%dst.1 = getelementptr i32, ptr %dst, i32 1
|
|
store i32 %mul.1, ptr %dst.1
|
|
|
|
%dst.2 = getelementptr i32, ptr %dst, i32 2
|
|
store i32 %mul.2, ptr %dst.2
|
|
|
|
ret void
|
|
}
|
|
|
|
; Same load/multiply pattern as a pairwise product of three adjacent i32
; elements from %src.1 and %src.2, but each product additionally has the
; constant 9 added before being stored to three adjacent i32 slots of %dst.
; Expected with the NON-POW2 prefix -- a <3 x i32> load/load/mul/add/store
; chain.
; Expected with the POW2-ONLY prefix -- elements 0-1 become a <2 x i32>
; chain while element 2 stays scalar.
define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
|
|
; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], splat (i32 9)
|
|
; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
|
|
; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
|
|
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
|
|
; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], splat (i32 9)
|
|
; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
|
|
; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
|
|
%l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
|
|
%gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
|
|
%l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
|
|
%mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
|
|
%add.0 = add i32 %mul.0, 9
|
|
|
|
%gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
|
|
%l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
|
|
%gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
|
|
%l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
|
|
%mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
|
|
%add.1 = add i32 %mul.1, 9
|
|
|
|
%gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
|
|
%l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
|
|
%gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
|
|
%l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
|
|
%mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
|
|
%add.2 = add i32 %mul.2, 9
|
|
|
|
store i32 %add.0, ptr %dst
|
|
|
|
%dst.1 = getelementptr i32, ptr %dst, i32 1
|
|
store i32 %add.1, ptr %dst.1
|
|
|
|
%dst.2 = getelementptr i32, ptr %dst, i32 2
|
|
store i32 %add.2, ptr %dst.2
|
|
|
|
ret void
|
|
}
|
|
|
|
; Floating-point variant: three adjacent float elements are loaded from
; %src, the constant 10.0 is added to each (fadd without fast-math flags),
; and the sums are stored to three adjacent float slots of %dst.
; Expected with the NON-POW2 prefix -- one <3 x float> load/fadd/store chain.
; Expected with the POW2-ONLY prefix -- elements 0-1 become a <2 x float>
; chain while element 2 stays scalar.
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
|
|
; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
|
|
; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
|
|
; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
|
|
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
|
|
; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
|
|
; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
|
|
%l.src.0 = load float , ptr %gep.src.0, align 4
|
|
%fadd.0 = fadd float %l.src.0, 10.0
|
|
|
|
%gep.src.1 = getelementptr inbounds float , ptr %src, i32 1
|
|
%l.src.1 = load float, ptr %gep.src.1, align 4
|
|
%fadd.1 = fadd float %l.src.1, 10.0
|
|
|
|
%gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
|
|
%l.src.2 = load float, ptr %gep.src.2, align 4
|
|
%fadd.2 = fadd float %l.src.2, 10.0
|
|
|
|
store float %fadd.0, ptr %dst
|
|
|
|
%dst.1 = getelementptr float, ptr %dst, i32 1
|
|
store float %fadd.1, ptr %dst.1
|
|
|
|
%dst.2 = getelementptr float, ptr %dst, i32 2
|
|
store float %fadd.2, ptr %dst.2
|
|
|
|
ret void
|
|
}
|
|
|
|
; Three i32 phi nodes in %exit (incoming values 1, 2, 3 from %entry and 0
; from %invoke.cont8.loopexit, a block with no predecessors) are stored to
; three adjacent i32 slots of %dst.
; Expected with the NON-POW2 prefix -- a single <3 x i32> phi, with poison on
; the edge from the predecessor-less block, feeding one <3 x i32> store.
; Expected with the POW2-ONLY prefix -- a <2 x i32> phi and store for
; elements 0-1, plus the scalar phi %p.2 and a scalar store for element 2.
define void @phi_store3(ptr %dst) {
|
|
; NON-POW2-LABEL: @phi_store3(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: br label [[EXIT:%.*]]
|
|
; NON-POW2: invoke.cont8.loopexit:
|
|
; NON-POW2-NEXT: br label [[EXIT]]
|
|
; NON-POW2: exit:
|
|
; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
|
|
; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @phi_store3(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: br label [[EXIT:%.*]]
|
|
; POW2-ONLY: invoke.cont8.loopexit:
|
|
; POW2-ONLY-NEXT: br label [[EXIT]]
|
|
; POW2-ONLY: exit:
|
|
; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
|
|
; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
|
|
; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
|
|
; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
|
|
; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %exit
|
|
|
|
invoke.cont8.loopexit: ; No predecessors!
|
|
br label %exit
|
|
|
|
exit:
|
|
%p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
|
|
%p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
|
|
%p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]
|
|
|
|
%dst.1 = getelementptr i32, ptr %dst, i32 1
|
|
%dst.2 = getelementptr i32, ptr %dst, i32 2
|
|
|
|
store i32 %p.0, ptr %dst, align 4
|
|
store i32 %p.1, ptr %dst.1, align 4
|
|
store i32 %p.2, ptr %dst.2, align 4
|
|
ret void
|
|
}
|
|
|
|
; Three constant-operand results (`add i32 0, 0`, then two `sub i32 0, 0`)
; are stored to %dst[0], %dst[1] and %dst[2]; element 0 uses a different
; opcode from elements 1 and 2.
; Expected with the NON-POW2 prefix -- one <3 x i32> zeroinitializer store.
; Expected with the POW2-ONLY prefix -- a <2 x i32> zeroinitializer store
; plus a scalar sub, getelementptr and store for the element at index 2.
; NOTE(review): the POW2-ONLY checks capture the base pointer under the name
; ARRAYIDX_I1887 although the only pointer argument is %dst. The pattern is
; `%.*`, so it matches any name, but the capture name looks stale -- confirm
; by regenerating with utils/update_test_checks.py.
define void @store_try_reorder(ptr %dst) {
|
|
; NON-POW2-LABEL: @store_try_reorder(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @store_try_reorder(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887:%.*]], align 4
|
|
; POW2-ONLY-NEXT: [[ADD216:%.*]] = sub i32 0, 0
|
|
; POW2-ONLY-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[ARRAYIDX_I1887]], i64 2
|
|
; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%add = add i32 0, 0
|
|
store i32 %add, ptr %dst, align 4
|
|
%add207 = sub i32 0, 0
|
|
%arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
|
|
store i32 %add207, ptr %arrayidx.i1887, align 4
|
|
%add216 = sub i32 0, 0
|
|
%arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
|
|
store i32 %add216, ptr %arrayidx.i1891, align 4
|
|
ret void
|
|
}
|
|
|
|
; The same float argument %0 is, three times over, extended to double
; (fpext), passed through llvm.fmuladd.f64 with both remaining operands
; 0.0, truncated back to float (fptrunc), and stored to %Colour[0..2].
; Expected with the NON-POW2 prefix -- %0 is splatted into a <3 x float> and
; the fpext/fmuladd/fptrunc/store chain is done on 3-element vectors.
; Expected with the POW2-ONLY prefix -- the same chain on 2-element vectors
; for elements 0-1, with element 2 left scalar.
define void @vec3_fpext_cost(ptr %Colour, float %0) {
|
|
; NON-POW2-LABEL: @vec3_fpext_cost(
|
|
; NON-POW2-NEXT: entry:
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
|
|
; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
|
|
; NON-POW2-NEXT: [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
|
|
; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
|
|
; NON-POW2-NEXT: ret void
|
|
;
|
|
; POW2-ONLY-LABEL: @vec3_fpext_cost(
|
|
; POW2-ONLY-NEXT: entry:
|
|
; POW2-ONLY-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
|
|
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
|
|
; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
|
|
; POW2-ONLY-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
|
|
; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
|
|
; POW2-ONLY-NEXT: [[CONV78:%.*]] = fpext float [[TMP0]] to double
|
|
; POW2-ONLY-NEXT: [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
|
|
; POW2-ONLY-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
|
|
; POW2-ONLY-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
|
|
; POW2-ONLY-NEXT: ret void
|
|
;
|
|
entry:
|
|
%arrayidx72 = getelementptr float, ptr %Colour, i64 1
|
|
%arrayidx80 = getelementptr float, ptr %Colour, i64 2
|
|
%conv62 = fpext float %0 to double
|
|
%1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
|
|
%conv66 = fptrunc double %1 to float
|
|
store float %conv66, ptr %Colour, align 4
|
|
%conv70 = fpext float %0 to double
|
|
%2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
|
|
%conv74 = fptrunc double %2 to float
|
|
store float %conv74, ptr %arrayidx72, align 4
|
|
%conv78 = fpext float %0 to double
|
|
%3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
|
|
%conv82 = fptrunc double %3 to float
|
|
store float %conv82, ptr %arrayidx80, align 4
|
|
ret void
|
|
}
|
|
|
|
; A single `fptrunc double %conv to float` result is stored to three
; adjacent float slots %dst[0..2]. Despite the function name, the body
; contains an fptrunc and no fpext.
; The shared CHECK prefix is used, so both run lines expect the scalar code
; to be left unchanged (three scalar stores, no vector instructions).
define void @fpext_scatter(ptr %dst, double %conv) {
|
|
; CHECK-LABEL: @fpext_scatter(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
|
|
; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
|
|
; CHECK-NEXT: store float [[CONV25]], ptr [[LENGTHS]], align 4
|
|
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
|
|
; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
|
|
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
|
|
; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
%conv25 = fptrunc double %conv to float
|
|
%Lengths = getelementptr float, ptr %dst, i64 0
|
|
store float %conv25, ptr %Lengths, align 4
|
|
%arrayidx32 = getelementptr float, ptr %dst, i64 1
|
|
store float %conv25, ptr %arrayidx32, align 4
|
|
%arrayidx37 = getelementptr float, ptr %dst, i64 2
|
|
store float %conv25, ptr %arrayidx37, align 4
|
|
ret void
|
|
}
|
|
|
|
; Integer add reduction over three adjacent i32 elements loaded from %src;
; the sum is returned.
; The shared CHECK prefix is used, so both run lines expect the scalar
; loads and adds to be left unchanged (no vector load or reduction call).
define i32 @reduce_add(ptr %src) {
|
|
; CHECK-LABEL: @reduce_add(
|
|
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
|
|
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
|
|
; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
|
|
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
|
|
; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
|
|
; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
|
|
; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
|
|
; CHECK-NEXT: ret i32 [[ADD_1]]
|
|
;
|
|
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
|
|
%l.src.0 = load i32, ptr %gep.src.0, align 4
|
|
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
|
|
%l.src.1 = load i32, ptr %gep.src.1, align 4
|
|
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
|
|
%l.src.2 = load i32, ptr %gep.src.2, align 4
|
|
|
|
%add.0 = add i32 %l.src.0, %l.src.1
|
|
%add.1 = add i32 %add.0, %l.src.2
|
|
ret i32 %add.1
|
|
}
|
|
|
|
; Floating-point add reduction (`fadd fast`) over three adjacent float
; elements loaded from %src; the sum is returned.
; Expected with the NON-POW2 prefix -- one <3 x float> load feeding
; llvm.vector.reduce.fadd.v3f32 with a 0.0 start value.
; Expected with the POW2-ONLY prefix -- the code stays fully scalar.
define float @reduce_fadd(ptr %src) {
|
|
; NON-POW2-LABEL: @reduce_fadd(
|
|
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
|
|
; NON-POW2-NEXT: ret float [[TMP2]]
|
|
;
|
|
; POW2-ONLY-LABEL: @reduce_fadd(
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
|
|
; POW2-ONLY-NEXT: ret float [[ADD_1]]
|
|
;
|
|
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
|
|
%l.src.0 = load float, ptr %gep.src.0, align 4
|
|
%gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
|
|
%l.src.1 = load float, ptr %gep.src.1, align 4
|
|
%gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
|
|
%l.src.2 = load float, ptr %gep.src.2, align 4
|
|
|
|
%add.0 = fadd fast float %l.src.0, %l.src.1
|
|
%add.1 = fadd fast float %add.0, %l.src.2
|
|
ret float %add.1
|
|
}
|
|
|
|
; Three adjacent i32 elements are loaded from %src, each is multiplied
; (mul nsw) by the constant 10, and the three products are summed and
; returned.
; Expected with the NON-POW2 prefix -- a <3 x i32> load, a <3 x i32> mul by
; splat(10), and llvm.vector.reduce.add.v3i32.
; Expected with the POW2-ONLY prefix -- the code stays fully scalar.
define i32 @reduce_add_after_mul(ptr %src) {
|
|
; NON-POW2-LABEL: @reduce_add_after_mul(
|
|
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], splat (i32 10)
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
|
|
; NON-POW2-NEXT: ret i32 [[TMP3]]
|
|
;
|
|
; POW2-ONLY-LABEL: @reduce_add_after_mul(
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
|
|
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
|
|
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
|
|
;
|
|
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
|
|
%l.src.0 = load i32, ptr %gep.src.0, align 4
|
|
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
|
|
%l.src.1 = load i32, ptr %gep.src.1, align 4
|
|
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
|
|
%l.src.2 = load i32, ptr %gep.src.2, align 4
|
|
|
|
%mul.0 = mul nsw i32 %l.src.0, 10
|
|
%mul.1 = mul nsw i32 %l.src.1, 10
|
|
%mul.2 = mul nsw i32 %l.src.2, 10
|
|
|
|
%add.0 = add i32 %mul.0, %mul.1
|
|
%add.1 = add i32 %add.0, %mul.2
|
|
ret i32 %add.1
|
|
}
|
|
|
|
; Integer dot product of two 3-element i32 arrays %a and %b: pairwise
; `mul nsw` of adjacent elements followed by an add reduction in the order
; (mul.0 + mul.1) + mul.2.
; Expected with the NON-POW2 prefix -- two <3 x i32> loads, a <3 x i32> mul,
; and llvm.vector.reduce.add.v3i32.
; Expected with the POW2-ONLY prefix -- elements 0-1 use <2 x i32> loads, a
; <2 x i32> mul and llvm.vector.reduce.add.v2i32; the scalar product of
; element 2 is then added to the reduced value.
define i32 @dot_product_i32(ptr %a, ptr %b) {
|
|
; NON-POW2-LABEL: @dot_product_i32(
|
|
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
|
|
; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
|
|
; NON-POW2-NEXT: ret i32 [[TMP4]]
|
|
;
|
|
; POW2-ONLY-LABEL: @dot_product_i32(
|
|
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
|
|
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
|
|
;
|
|
%gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
|
|
%l.a.0 = load i32, ptr %gep.a.0, align 4
|
|
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
|
|
%l.a.1 = load i32, ptr %gep.a.1, align 4
|
|
%gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
|
|
%l.a.2 = load i32, ptr %gep.a.2, align 4
|
|
|
|
%gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
|
|
%l.b.0 = load i32, ptr %gep.b.0, align 4
|
|
%gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
|
|
%l.b.1 = load i32, ptr %gep.b.1, align 4
|
|
%gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
|
|
%l.b.2 = load i32, ptr %gep.b.2, align 4
|
|
|
|
%mul.0 = mul nsw i32 %l.a.0, %l.b.0
|
|
%mul.1 = mul nsw i32 %l.a.1, %l.b.1
|
|
%mul.2 = mul nsw i32 %l.a.2, %l.b.2
|
|
|
|
%add.0 = add i32 %mul.0, %mul.1
|
|
%add.1 = add i32 %add.0, %mul.2
|
|
ret i32 %add.1
|
|
}
|
|
|
|
; Same as above, except the reduction order has been perturbed. This
|
|
; is checking for our ability to reorder.
|
|
; Integer dot product of two 3-element i32 arrays %a and %b in which the
; first add takes its operands in swapped order (%mul.1, %mul.0).
; Expected with the NON-POW2 prefix -- two <3 x i32> loads, a <3 x i32> mul,
; and llvm.vector.reduce.add.v3i32.
; Expected with the POW2-ONLY prefix -- elements 0-1 use <2 x i32> loads, a
; <2 x i32> mul and llvm.vector.reduce.add.v2i32; the scalar product of
; element 2 is then added to the reduced value.
define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
|
|
; NON-POW2-LABEL: @dot_product_i32_reorder(
|
|
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
|
|
; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
|
|
; NON-POW2-NEXT: ret i32 [[TMP4]]
|
|
;
|
|
; POW2-ONLY-LABEL: @dot_product_i32_reorder(
|
|
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
|
|
; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
|
|
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
|
|
;
|
|
%gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
|
|
%l.a.0 = load i32, ptr %gep.a.0, align 4
|
|
%gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
|
|
%l.a.1 = load i32, ptr %gep.a.1, align 4
|
|
%gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
|
|
%l.a.2 = load i32, ptr %gep.a.2, align 4
|
|
|
|
%gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
|
|
%l.b.0 = load i32, ptr %gep.b.0, align 4
|
|
%gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
|
|
%l.b.1 = load i32, ptr %gep.b.1, align 4
|
|
%gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
|
|
%l.b.2 = load i32, ptr %gep.b.2, align 4
|
|
|
|
%mul.0 = mul nsw i32 %l.a.0, %l.b.0
|
|
%mul.1 = mul nsw i32 %l.a.1, %l.b.1
|
|
%mul.2 = mul nsw i32 %l.a.2, %l.b.2
|
|
|
|
%add.0 = add i32 %mul.1, %mul.0
|
|
%add.1 = add i32 %add.0, %mul.2
|
|
ret i32 %add.1
|
|
}
|
|
|
|
; Floating-point dot product of two 3-element float arrays %a and %b:
; pairwise `fmul fast` of adjacent elements followed by a `fadd fast`
; reduction in the order (mul.0 + mul.1) + mul.2.
; Expected with the NON-POW2 prefix -- two <3 x float> loads, a <3 x float>
; fmul fast, and llvm.vector.reduce.fadd.v3f32 with a 0.0 start value.
; Expected with the POW2-ONLY prefix -- the code stays fully scalar.
define float @dot_product_fp32(ptr %a, ptr %b) {
|
|
; NON-POW2-LABEL: @dot_product_fp32(
|
|
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
|
|
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
|
|
; NON-POW2-NEXT: ret float [[TMP4]]
|
|
;
|
|
; POW2-ONLY-LABEL: @dot_product_fp32(
|
|
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
|
|
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
|
|
; POW2-ONLY-NEXT: ret float [[ADD_1]]
|
|
;
|
|
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
|
|
%l.a.0 = load float, ptr %gep.a.0, align 4
|
|
%gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
|
|
%l.a.1 = load float, ptr %gep.a.1, align 4
|
|
%gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
|
|
%l.a.2 = load float, ptr %gep.a.2, align 4
|
|
|
|
%gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
|
|
%l.b.0 = load float, ptr %gep.b.0, align 4
|
|
%gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
|
|
%l.b.1 = load float, ptr %gep.b.1, align 4
|
|
%gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
|
|
%l.b.2 = load float, ptr %gep.b.2, align 4
|
|
|
|
%mul.0 = fmul fast float %l.a.0, %l.b.0
|
|
%mul.1 = fmul fast float %l.a.1, %l.b.1
|
|
%mul.2 = fmul fast float %l.a.2, %l.b.2
|
|
|
|
%add.0 = fadd fast float %mul.0, %mul.1
|
|
%add.1 = fadd fast float %add.0, %mul.2
|
|
ret float %add.1
|
|
}
|
|
|
|
; Same as above, except the reduction order has been perturbed. This
|
|
; is checking for our ability to reorder.
|
|
; Variant of dot_product_fp32 whose first fadd takes its operands as
; (%mul.1, %mul.0) instead of (%mul.0, %mul.1).
; The NON-POW2 expectations are the same <3 x float> load / fmul /
; llvm.vector.reduce.fadd.v3f32 sequence as for dot_product_fp32; the POW2-ONLY
; expectations are the scalar code in its original order.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
|
|
; NON-POW2-LABEL: @dot_product_fp32_reorder(
|
|
; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
|
|
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
|
|
; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
|
|
; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
|
|
; NON-POW2-NEXT: ret float [[TMP4]]
|
|
;
|
|
; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
|
|
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
|
|
; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
|
|
; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
|
|
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
|
|
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
|
|
; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
|
|
; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
|
|
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
|
|
; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]]
|
|
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
|
|
; POW2-ONLY-NEXT: ret float [[ADD_1]]
|
|
;
|
|
; Scalar loads of the three consecutive floats at %a.
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
|
|
%l.a.0 = load float, ptr %gep.a.0, align 4
|
|
%gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
|
|
%l.a.1 = load float, ptr %gep.a.1, align 4
|
|
%gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
|
|
%l.a.2 = load float, ptr %gep.a.2, align 4
|
|
|
|
; Scalar loads of the three consecutive floats at %b.
%gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
|
|
%l.b.0 = load float, ptr %gep.b.0, align 4
|
|
%gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
|
|
%l.b.1 = load float, ptr %gep.b.1, align 4
|
|
%gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
|
|
%l.b.2 = load float, ptr %gep.b.2, align 4
|
|
|
|
; Element-wise products a[i] * b[i].
%mul.0 = fmul fast float %l.a.0, %l.b.0
|
|
%mul.1 = fmul fast float %l.a.1, %l.b.1
|
|
%mul.2 = fmul fast float %l.a.2, %l.b.2
|
|
|
|
; The perturbed step: the operands are %mul.1, %mul.0 (lane 1 first).
%add.0 = fadd fast float %mul.1, %mul.0
|
|
%add.1 = fadd fast float %add.0, %mul.2
|
|
ret float %add.1
|
|
}
|
|
|
|
|
|
; Three-element double dot product with the same shape as dot_product_fp32:
; three loads from %a, three from %b, pairwise `fmul fast`, then a chain of two
; `fadd fast`.
; The assertions use the common CHECK prefix, so both RUN configurations are
; expected to produce the same output: the scalar code in its original order.
; NOTE(review): the double loads are marked `align 4`, i.e. below the natural
; 8-byte alignment of double - confirm whether this is intentional.
define double @dot_product_fp64(ptr %a, ptr %b) {
|
|
; CHECK-LABEL: @dot_product_fp64(
|
|
; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
|
|
; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
|
|
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
|
|
; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
|
|
; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
|
|
; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
|
|
; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
|
|
; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
|
|
; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
|
|
; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
|
|
; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
|
|
; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
|
|
; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
|
|
; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
|
|
; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
|
|
; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
|
|
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
|
|
; CHECK-NEXT: ret double [[ADD_1]]
|
|
;
|
|
; Scalar loads of the three consecutive doubles at %a.
%gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
|
|
%l.a.0 = load double, ptr %gep.a.0, align 4
|
|
%gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
|
|
%l.a.1 = load double, ptr %gep.a.1, align 4
|
|
%gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
|
|
%l.a.2 = load double, ptr %gep.a.2, align 4
|
|
|
|
; Scalar loads of the three consecutive doubles at %b.
%gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
|
|
%l.b.0 = load double, ptr %gep.b.0, align 4
|
|
%gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
|
|
%l.b.1 = load double, ptr %gep.b.1, align 4
|
|
%gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
|
|
%l.b.2 = load double, ptr %gep.b.2, align 4
|
|
|
|
; Element-wise products a[i] * b[i].
%mul.0 = fmul fast double %l.a.0, %l.b.0
|
|
%mul.1 = fmul fast double %l.a.1, %l.b.1
|
|
%mul.2 = fmul fast double %l.a.2, %l.b.2
|
|
|
|
; Linear fadd chain over the three products.
%add.0 = fadd fast double %mul.0, %mul.1
|
|
%add.1 = fadd fast double %add.0, %mul.2
|
|
ret double %add.1
|
|
}
|
|
|
|
;; Covers a case where SLP would previously crash due to a
|
|
;; missing bailout in TryToFindDuplicates for the case
|
|
;; where a VL=3 list was vectorized directly (without
|
|
;; a root instruction such as a store or reduce).
|
|
;; Computes (p[0] * p[0]) + ((p[2] * p[2]) * p[1]) on doubles loaded from
;; %ptr at byte offsets 0, 8 and 16. The assertions use the common CHECK
;; prefix, so both RUN configurations are expected to leave the scalar code
;; in its original order.
define double @no_root_reshuffle(ptr %ptr) {
|
|
; CHECK-LABEL: @no_root_reshuffle(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
|
|
; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
|
|
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
|
|
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
|
|
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
|
|
; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
|
|
; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
|
|
; CHECK-NEXT: ret double [[ADD]]
|
|
;
|
|
entry:
|
|
; First element, squared.
%0 = load double, ptr %ptr, align 8
|
|
%mul = fmul fast double %0, %0
|
|
; Second and third elements, addressed with byte (i8) GEPs.
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
|
|
%1 = load double, ptr %arrayidx2, align 8
|
|
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
|
|
%2 = load double, ptr %arrayidx3, align 8
|
|
; Third element squared, then scaled by the second element.
%3 = fmul fast double %2, %2
|
|
%mul6 = fmul fast double %3, %1
|
|
%add = fadd fast double %mul6, %mul
|
|
ret double %add
|
|
}
|
|
|
|
; Multiplies each of the three scalar arguments by the constant 10.0 with
; `fmul fast` and sums the products with a chain of two `fadd fast`. The fmul
; operands are function arguments rather than loads. The assertions use the
; common CHECK prefix, so both RUN configurations are expected to leave the
; scalar code in its original order.
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
|
|
; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
|
|
; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
|
|
; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
|
|
; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
|
|
; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
|
|
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
|
|
; CHECK-NEXT: ret float [[ADD_1]]
|
|
;
|
|
; Per-argument scaling by the same constant.
%mul.0 = fmul fast float %a, 10.0
|
|
%mul.1 = fmul fast float %b, 10.0
|
|
%mul.2 = fmul fast float %c, 10.0
|
|
|
|
; Linear fadd chain over the three products.
%add.0 = fadd fast float %mul.0, %mul.1
|
|
%add.1 = fadd fast float %add.0, %mul.2
|
|
ret float %add.1
|
|
}
|
|
|
|
|
|
; Declarations of the llvm.fmuladd intrinsic for float and double.
; NOTE(review): no function between dot_product_fp32 and the end of the file
; calls these; presumably they are used by tests earlier in the file - confirm.
declare float @llvm.fmuladd.f32(float, float, float)
|
|
|
|
declare double @llvm.fmuladd.f64(double, double, double)
|