
extract subvector. Many targets do not have a cost for the extractsubvector shuffle kind, but do have costs for single-source permutes. If there is no cost estimate for extractsubvector, it is better to switch to the single-source permute kind for a better cost estimate. Reviewers: RKSimon, davemgreen, arsenm Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/79837
140 lines
7.5 KiB
LLVM
140 lines
7.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; REQUIRES: aarch64-registered-target
|
|
; RUN: opt -passes='lower-matrix-intrinsics' -mtriple=arm64-apple-iphoneos -S < %s | FileCheck %s
|
|
|
|
; Dot product of two <6 x float> values, expressed as a matrix multiply whose
; dimension arguments are (i32 1, i32 6, i32 1).
; The expected output below is a single elementwise fmul, a fast
; vector.reduce.fadd over the 6 products starting from 0.0, and an
; insertelement that places the scalar in lane 0 of the <1 x float> result.
define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) {
|
|
; CHECK-LABEL: @dotproduct_float_v6(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x float> [[TMP2]]
|
|
;
|
|
entry:
|
|
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1)
|
|
ret <1 x float> %c
|
|
}
|
|
|
|
declare <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32)
|
|
|
|
; Degenerate dot product: both operands are <1 x float> and the multiply's
; dimension arguments are (i32 1, i32 1, i32 1).
; The expected output still uses the fmul + fast vector.reduce.fadd +
; insertelement sequence, here on single-element vectors.
define <1 x float> @dotproduct_float_v1(<1 x float> %a, <1 x float> %b) {
|
|
; CHECK-LABEL: @dotproduct_float_v1(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <1 x float> [[A:%.*]], [[B:%.*]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x float> [[TMP2]]
|
|
;
|
|
entry:
|
|
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v1f32.v1f32(<1 x float> %a, <1 x float> %b, i32 1, i32 1, i32 1)
|
|
ret <1 x float> %c
|
|
}
|
|
|
|
declare <1 x float> @llvm.matrix.multiply.v1f32.v1f32.v1f32(<1 x float>, <1 x float>, i32, i32, i32)
|
|
|
|
; Dot product of two <3 x float> values; the multiply's dimension arguments
; are (i32 1, i32 3, i32 1).
; The expected output is an elementwise fmul on <3 x float>, a fast
; vector.reduce.fadd of the products, and an insertelement into lane 0 of the
; <1 x float> result.
define <1 x float> @dotproduct_float_v3(<3 x float> %a, <3 x float> %b) {
|
|
; CHECK-LABEL: @dotproduct_float_v3(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <3 x float> [[A:%.*]], [[B:%.*]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x float> [[TMP2]]
|
|
;
|
|
entry:
|
|
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v3f32.v3f32(<3 x float> %a, <3 x float> %b, i32 1, i32 3, i32 1)
|
|
ret <1 x float> %c
|
|
}
|
|
|
|
declare <1 x float> @llvm.matrix.multiply.v1f32.v3f32.v3f32(<3 x float>, <3 x float>, i32, i32, i32)
|
|
|
|
; Dot product whose operands both come from llvm.matrix.column.major.load:
; %lhs is loaded with stride 1 and shape arguments (i32 1, i32 6), %rhs with
; stride 6 and shape arguments (i32 6, i32 1). Both call sites pass
; `ptr nonnull align 4`.
; The expected output replaces both intrinsic loads with plain <6 x float>
; loads, then emits the fmul + fast vector.reduce.fadd + insertelement sequence.
; NOTE(review): the expected lhs load carries `align 32` while the call site
; only states `align 4` (the rhs load keeps `align 4`) -- confirm that the
; larger alignment on the lhs load is intended pass behaviour.
define <1 x float> @intrinsic_column_major_load_dot_product_float_v6(ptr %lhs_address, ptr %rhs_address) {
|
|
; CHECK-LABEL: @intrinsic_column_major_load_dot_product_float_v6(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x float>, ptr [[RHS_ADDRESS:%.*]], align 4
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load <6 x float>, ptr [[LHS_ADDRESS:%.*]], align 32
|
|
; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x float> [[TMP0]], [[COL_LOAD]]
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP1]])
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0
|
|
; CHECK-NEXT: ret <1 x float> [[TMP3]]
|
|
;
|
|
entry:
|
|
%lhs = tail call fast <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
|
|
%rhs = tail call fast <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4 %rhs_address, i64 6, i1 false, i32 6, i32 1)
|
|
%result = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %lhs, <6 x float> %rhs, i32 1, i32 6, i32 1)
|
|
ret <1 x float> %result
|
|
}
|
|
|
|
declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr nonnull align 4, i64, i1, i32, i32)
|
|
|
|
; Dot product whose operands are ordinary `load <7 x float>` instructions with
; no explicit alignment; the multiply's dimension arguments are
; (i32 1, i32 7, i32 1).
; The expected output keeps two <7 x float> loads (printed with align 32),
; multiplies them elementwise, reduces with a fast vector.reduce.fadd, and
; inserts the scalar into lane 0 of the <1 x float> result.
define <1 x float> @LoadInst_dot_product_float_v7(ptr %lhs_address, ptr %rhs_address) {
|
|
; CHECK-LABEL: @LoadInst_dot_product_float_v7(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[LHS:%.*]] = load <7 x float>, ptr [[LHS_ADDRESS:%.*]], align 32
|
|
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <7 x float>, ptr [[RHS_ADDRESS:%.*]], align 32
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <7 x float> [[LHS]], [[COL_LOAD]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v7f32(float 0.000000e+00, <7 x float> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x float> [[TMP2]]
|
|
;
|
|
entry:
|
|
%lhs = load <7 x float>, ptr %lhs_address
|
|
%rhs = load <7 x float>, ptr %rhs_address
|
|
%c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v7f32.v7f32(<7 x float> %lhs, <7 x float> %rhs, i32 1, i32 7, i32 1)
|
|
ret <1 x float> %c
|
|
}
|
|
|
|
declare <1 x float> @llvm.matrix.multiply.v1f32.v7f32.v7f32(<7 x float>, <7 x float>, i32, i32, i32)
|
|
|
|
; Double-precision counterpart of the <6 x float> dot product test: both
; operands are <6 x double> and the multiply's dimension arguments are
; (i32 1, i32 6, i32 1).
; The expected output is an elementwise fmul on <6 x double>, a fast
; vector.reduce.fadd starting from 0.0, and an insertelement into lane 0 of
; the <1 x double> result.
define <1 x double> @dotproduct_double_v6(<6 x double> %a, <6 x double> %b) {
|
|
; CHECK-LABEL: @dotproduct_double_v6(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x double> [[A:%.*]], [[B:%.*]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x double> poison, double [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x double> [[TMP2]]
|
|
;
|
|
entry:
|
|
%c = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double> %a, <6 x double> %b, i32 1, i32 6, i32 1)
|
|
ret <1 x double> %c
|
|
}
|
|
|
|
declare <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double>, <6 x double>, i32, i32, i32)
|
|
|
|
; Double-precision dot product whose operands both come from
; llvm.matrix.column.major.load: %lhs uses stride 1 with shape arguments
; (i32 1, i32 6), %rhs uses stride 6 with shape arguments (i32 6, i32 1).
; Both call sites pass `ptr nonnull align 4`.
; The expected output replaces both intrinsic loads with plain <6 x double>
; loads, then emits the fmul + fast vector.reduce.fadd + insertelement sequence.
; NOTE(review): the expected lhs load carries `align 64` while the call site
; only states `align 4` (the rhs load keeps `align 4`) -- confirm that the
; larger alignment on the lhs load is intended pass behaviour.
define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) {
|
|
; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
|
|
; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]]
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]])
|
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
|
|
; CHECK-NEXT: ret <1 x double> [[TMP3]]
|
|
;
|
|
entry:
|
|
%lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6)
|
|
%rhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %rhs_address, i64 6, i1 false, i32 6, i32 1)
|
|
%result = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v6f64.v6f64(<6 x double> %lhs, <6 x double> %rhs, i32 1, i32 6, i32 1)
|
|
ret <1 x double> %result
|
|
}
|
|
|
|
declare <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4, i64, i1, i32, i32)
|
|
|
|
; Double-precision dot product whose operands are ordinary
; `load <7 x double>` instructions with no explicit alignment; the multiply's
; dimension arguments are (i32 1, i32 7, i32 1).
; The expected output keeps two <7 x double> loads (printed with align 64),
; multiplies them elementwise, reduces with a fast vector.reduce.fadd, and
; inserts the scalar into lane 0 of the <1 x double> result.
define <1 x double> @LoadInst_dot_product_double_v7(ptr %lhs_address, ptr %rhs_address) {
|
|
; CHECK-LABEL: @LoadInst_dot_product_double_v7(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[LHS:%.*]] = load <7 x double>, ptr [[LHS_ADDRESS:%.*]], align 64
|
|
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <7 x double>, ptr [[RHS_ADDRESS:%.*]], align 64
|
|
; CHECK-NEXT: [[TMP0:%.*]] = fmul <7 x double> [[LHS]], [[COL_LOAD]]
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> [[TMP0]])
|
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x double> poison, double [[TMP1]], i64 0
|
|
; CHECK-NEXT: ret <1 x double> [[TMP2]]
|
|
;
|
|
entry:
|
|
%lhs = load <7 x double>, ptr %lhs_address
|
|
%rhs = load <7 x double>, ptr %rhs_address
|
|
%c = tail call fast <1 x double> @llvm.matrix.multiply.v1f64.v7f64.v7f64(<7 x double> %lhs, <7 x double> %rhs, i32 1, i32 7, i32 1)
|
|
ret <1 x double> %c
|
|
}
|
|
|
|
declare <1 x double> @llvm.matrix.multiply.v1f64.v7f64.v7f64(<7 x double>, <7 x double>, i32, i32, i32)
|