Alexey Bataev 7bc079c852
[TTI]Fallback to SingleSrcPermute shuffle kind, if no direct estimation for
extract subvector.

Many targets do not have cost for extractsubvector shuffle kind, but
have the costs for single source permute. If there are no costs
estimation for extractsubvector, better to switchto single source
permute for better cost estimation.

Reviewers: RKSimon, davemgreen, arsenm

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/79837
2024-02-12 07:09:49 -05:00

140 lines
7.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: aarch64-registered-target
; RUN: opt -passes='lower-matrix-intrinsics' -mtriple=arm64-apple-iphoneos -S < %s | FileCheck %s
define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) {
; CHECK-LABEL: @dotproduct_float_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP2]]
;
entry:
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1)
ret <1 x float> %c
}
declare <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32)
define <1 x float> @dotproduct_float_v1(<1 x float> %a, <1 x float> %b) {
; CHECK-LABEL: @dotproduct_float_v1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fmul <1 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP2]]
;
entry:
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v1f32.v1f32(<1 x float> %a, <1 x float> %b, i32 1, i32 1, i32 1)
ret <1 x float> %c
}
declare <1 x float> @llvm.matrix.multiply.v1f32.v1f32.v1f32(<1 x float>, <1 x float>, i32, i32, i32)
define <1 x float> @dotproduct_float_v3(<3 x float> %a, <3 x float> %b) {
; CHECK-LABEL: @dotproduct_float_v3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fmul <3 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP2]]
;
entry:
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v3f32.v3f32(<3 x float> %a, <3 x float> %b, i32 1, i32 3, i32 1)
ret <1 x float> %c
}
declare <1 x float> @llvm.matrix.multiply.v1f32.v3f32.v3f32(<3 x float>, <3 x float>, i32, i32, i32)
define <1 x float> @intrinsic_column_major_load_dot_product_float_v6(ptr %lhs_address, ptr %rhs_address) {
; CHECK-LABEL: @intrinsic_column_major_load_dot_product_float_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x float>, ptr [[RHS_ADDRESS:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <6 x float>, ptr [[LHS_ADDRESS:%.*]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x float> [[TMP0]], [[COL_LOAD]]
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP3]]
;
entry:
%lhs = tail call fast <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
%rhs = tail call fast <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4 %rhs_address, i64 6, i1 false, i32 6, i32 1)
%result = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %lhs, <6 x float> %rhs, i32 1, i32 6, i32 1)
ret <1 x float> %result
}
declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4, i64, i1, i32, i32)
define <1 x float> @LoadInst_dot_product_float_v7(ptr %lhs_address, ptr %rhs_address) {
; CHECK-LABEL: @LoadInst_dot_product_float_v7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[LHS:%.*]] = load <7 x float>, ptr [[LHS_ADDRESS:%.*]], align 32
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <7 x float>, ptr [[RHS_ADDRESS:%.*]], align 32
; CHECK-NEXT: [[TMP0:%.*]] = fmul <7 x float> [[LHS]], [[COL_LOAD]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v7f32(float 0.000000e+00, <7 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x float> [[TMP2]]
;
entry:
%lhs = load <7 x float>, ptr %lhs_address
%rhs = load <7 x float>, ptr %rhs_address
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v7f32.v7f32(<7 x float> %lhs, <7 x float> %rhs, i32 1, i32 7, i32 1)
ret <1 x float> %c
}
declare <1 x float> @llvm.matrix.multiply.v1f32.v7f32.v7f32(<7 x float>, <7 x float>, i32, i32, i32)
define <1 x double> @dotproduct_double_v6(<6 x double> %a, <6 x double> %b) {
; CHECK-LABEL: @dotproduct_double_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x double> poison, double [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x double> [[TMP2]]
;
entry:
%c = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 1, i32 6, i32 1)
ret <1 x double> %c
}
declare <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) {
; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]]
; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
; CHECK-NEXT: ret <1 x double> [[TMP3]]
;
entry:
%lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
%rhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %rhs_address, i64 6, i1 false, i32 6, i32 1)
%result = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double> %lhs, <6 x double> %rhs, i32 1, i32 6, i32 1)
ret <1 x double> %result
}
declare <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4, i64, i1, i32, i32)
define <1 x double> @LoadInst_dot_product_double_v7(ptr %lhs_address, ptr %rhs_address) {
; CHECK-LABEL: @LoadInst_dot_product_double_v7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[LHS:%.*]] = load <7 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <7 x double>, ptr [[RHS_ADDRESS:%.*]], align 64
; CHECK-NEXT: [[TMP0:%.*]] = fmul <7 x double> [[LHS]], [[COL_LOAD]]
; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x double> poison, double [[TMP1]], i64 0
; CHECK-NEXT: ret <1 x double> [[TMP2]]
;
entry:
%lhs = load <7 x double>, ptr %lhs_address
%rhs = load <7 x double>, ptr %rhs_address
%c = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v7f64.v7f64(<7 x double> %lhs, <7 x double> %rhs, i32 1, i32 7, i32 1)
ret <1 x double> %c
}
declare <1 x double> @llvm.matrix.multiply.v1f64.v7f64.v7f64(<7 x double>, <7 x double>, i32, i32, i32)