
; Matches dupq segmented lane splats in one of the operands of the
; fmul/fmla/fmls instructions, and uses the indexed form.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

; fmul, f16, 256-bit fixed-length vectors.
; The shuffle mask (2 x8, then 10 x8) broadcasts lane 2 of each 128-bit
; segment of %a across that segment. The splat is the second fmul operand.
; The expected output is a single indexed multiply: fmul z0.h, z1.h, z0.h[2].
define void @fmul_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.h, z1.h, z0.h[2]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x half>, ptr %a
  %ld.b = load <16 x half>, ptr %b
  %splat.lanes = shufflevector <16 x half> %ld.a, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                  i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %res = fmul <16 x half> %ld.b, %splat.lanes
  store <16 x half> %res, ptr %c
  ret void
}
; fmul, bf16, 256-bit fixed-length vectors.
; Same segmented splat as the f16 test (lane 2 of each 128-bit segment of %a),
; but the expected output does not use an indexed SVE multiply. Each 128-bit
; half is handled with NEON: dup of lane 2, widening to f32 with shll #16,
; a 4 x f32 fmul, then narrowing back with bfcvtn/bfcvtn2.
define void @fmul_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: shll v4.4s, v2.4h, #16
; CHECK-NEXT: shll v6.4s, v3.4h, #16
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
; CHECK-NEXT: shll v5.4s, v0.4h, #16
; CHECK-NEXT: shll v7.4s, v1.4h, #16
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
; CHECK-NEXT: stp q2, q3, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x bfloat>, ptr %a
  %ld.b = load <16 x bfloat>, ptr %b
  %splat.lanes = shufflevector <16 x bfloat> %ld.a, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                      i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %res = fmul <16 x bfloat> %ld.b, %splat.lanes
  store <16 x bfloat> %res, ptr %c
  ret void
}
; fmul, f32, 256-bit fixed-length vectors.
; The mask (3 x4, then 7 x4) broadcasts lane 3 of each 128-bit segment of %a.
; Here the splat is the FIRST fmul operand, so this covers the commuted form.
; The expected output is still the indexed multiply: fmul z0.s, z1.s, z0.s[3].
define void @fmul_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.s, z1.s, z0.s[3]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %ld.a = load <8 x float>, ptr %a
  %ld.b = load <8 x float>, ptr %b
  %splat.lanes = shufflevector <8 x float> %ld.a, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                                                                                i32 7, i32 7, i32 7, i32 7>
  %res = fmul <8 x float> %splat.lanes, %ld.b
  store <8 x float> %res, ptr %c
  ret void
}
; fmul, f64, 256-bit fixed-length vectors, lane 0 of each 128-bit segment.
; The mask <0, 0, 2, 2> is the same permutation TRN1 produces when both inputs
; are %ld.a (presumably the reason for the "trn1" suffix). The splat is the
; first fmul operand.
; The expected output is the indexed multiply: fmul z0.d, z1.d, z0.d[0].
define void @fmul_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.d, z1.d, z0.d[0]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res = fmul <4 x double> %splat.lanes, %ld.b
  store <4 x double> %res, ptr %c
  ret void
}
; fmul, f64, 256-bit fixed-length vectors, lane 1 of each 128-bit segment.
; The mask <1, 1, 3, 3> is the same permutation TRN2 produces when both inputs
; are %ld.a (presumably the reason for the "trn2" suffix). The splat is the
; second fmul operand.
; The expected output is the indexed multiply: fmul z0.d, z1.d, z0.d[1].
define void @fmul_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmul_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: fmul z0.d, z1.d, z0.d[1]
; CHECK-NEXT: str z0, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res = fmul <4 x double> %ld.b, %splat.lanes
  store <4 x double> %res, ptr %c
  ret void
}
; fmla, f16, 256-bit fixed-length vectors.
; llvm.fmuladd(%ld.b, splat, %ld.c), where the splat broadcasts lane 2 of each
; 128-bit segment of %a. The accumulator is loaded from %c and the result is
; stored back to %c.
; The expected output is the indexed form: fmla z2.h, z1.h, z0.h[2].
define void @fmla_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.h, z1.h, z0.h[2]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x half>, ptr %a
  %ld.b = load <16 x half>, ptr %b
  %ld.c = load <16 x half>, ptr %c
  %splat.lanes = shufflevector <16 x half> %ld.a, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                  i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %res = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> %ld.b, <16 x half> %splat.lanes, <16 x half> %ld.c)
  store <16 x half> %res, ptr %c
  ret void
}
; fmla, bf16, 256-bit fixed-length vectors.
; llvm.fmuladd(%ld.b, splat, %ld.c) with the lane-2 segmented splat of %a.
; The expected output does not use an indexed SVE instruction and is not
; fused. It is a NEON sequence: dup of lane 2, widen to f32 (shll #16), fmul,
; narrow (bfcvtn); then the product and %c are widened again, added with
; fadd, and narrowed a second time before the store.
define void @fmla_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: shll v4.4s, v2.4h, #16
; CHECK-NEXT: shll v6.4s, v3.4h, #16
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
; CHECK-NEXT: shll v5.4s, v0.4h, #16
; CHECK-NEXT: shll v7.4s, v1.4h, #16
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
; CHECK-NEXT: ldp q0, q1, [x2]
; CHECK-NEXT: shll v4.4s, v0.4h, #16
; CHECK-NEXT: shll v5.4s, v2.4h, #16
; CHECK-NEXT: shll v6.4s, v1.4h, #16
; CHECK-NEXT: shll v7.4s, v3.4h, #16
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
; CHECK-NEXT: fadd v4.4s, v5.4s, v4.4s
; CHECK-NEXT: fadd v5.4s, v7.4s, v6.4s
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
; CHECK-NEXT: stp q2, q3, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x bfloat>, ptr %a
  %ld.b = load <16 x bfloat>, ptr %b
  %ld.c = load <16 x bfloat>, ptr %c
  %splat.lanes = shufflevector <16 x bfloat> %ld.a, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                      i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %res = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> %ld.b, <16 x bfloat> %splat.lanes, <16 x bfloat> %ld.c)
  store <16 x bfloat> %res, ptr %c
  ret void
}
; fmla, f32, 256-bit fixed-length vectors.
; llvm.fmuladd(splat, %ld.b, %ld.c): the lane-3 segmented splat of %a is the
; FIRST multiplicand, covering the commuted operand order.
; The expected output is the indexed form: fmla z2.s, z1.s, z0.s[3].
define void @fmla_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.s, z1.s, z0.s[3]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <8 x float>, ptr %a
  %ld.b = load <8 x float>, ptr %b
  %ld.c = load <8 x float>, ptr %c
  %splat.lanes = shufflevector <8 x float> %ld.a, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                                                                                i32 7, i32 7, i32 7, i32 7>
  %res = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %splat.lanes, <8 x float> %ld.b, <8 x float> %ld.c)
  store <8 x float> %res, ptr %c
  ret void
}
; fmla, f64, 256-bit fixed-length vectors, lane 0 of each 128-bit segment.
; llvm.fmuladd(splat, %ld.b, %ld.c) with mask <0, 0, 2, 2>, which is the same
; permutation TRN1 produces when both inputs are %ld.a. The splat is the first
; multiplicand.
; The expected output is the indexed form: fmla z2.d, z1.d, z0.d[0].
define void @fmla_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.d, z1.d, z0.d[0]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %ld.c = load <4 x double>, ptr %c
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %splat.lanes, <4 x double> %ld.b, <4 x double> %ld.c)
  store <4 x double> %res, ptr %c
  ret void
}
; fmla, f64, 256-bit fixed-length vectors, lane 1 of each 128-bit segment.
; llvm.fmuladd(%ld.b, splat, %ld.c) with mask <1, 1, 3, 3>, which is the same
; permutation TRN2 produces when both inputs are %ld.a. The splat is the
; second multiplicand.
; The expected output is the indexed form: fmla z2.d, z1.d, z0.d[1].
define void @fmla_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmla_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmla z2.d, z1.d, z0.d[1]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %ld.c = load <4 x double>, ptr %c
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %ld.b, <4 x double> %splat.lanes, <4 x double> %ld.c)
  store <4 x double> %res, ptr %c
  ret void
}
; fmls, f16, 256-bit fixed-length vectors.
; llvm.fmuladd(-%ld.b, splat, %ld.c): the non-splat multiplicand is negated
; with fneg, and the splat broadcasts lane 2 of each 128-bit segment of %a.
; The expected output folds the negation into the indexed multiply-subtract:
; fmls z2.h, z1.h, z0.h[2].
define void @fmls_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.h, z1.h, z0.h[2]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x half>, ptr %a
  %ld.b = load <16 x half>, ptr %b
  %ld.c = load <16 x half>, ptr %c
  %splat.lanes = shufflevector <16 x half> %ld.a, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                  i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %neg.b = fneg <16 x half> %ld.b
  %res = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> %neg.b, <16 x half> %splat.lanes, <16 x half> %ld.c)
  store <16 x half> %res, ptr %c
  ret void
}
; fmls, bf16, 256-bit fixed-length vectors.
; llvm.fmuladd(-%ld.b, splat, %ld.c) with the lane-2 segmented splat of %a.
; The expected output does not use an indexed SVE instruction and is not
; fused. It is a NEON sequence: dup of lane 2, widen to f32 (shll #16), fmul,
; narrow (bfcvtn); then %c and the product are widened again and combined
; with fsub (accumulator minus product), and narrowed before the store.
define void @fmls_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: shll v4.4s, v2.4h, #16
; CHECK-NEXT: shll v6.4s, v3.4h, #16
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
; CHECK-NEXT: shll v5.4s, v0.4h, #16
; CHECK-NEXT: shll v7.4s, v1.4h, #16
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
; CHECK-NEXT: ldp q0, q1, [x2]
; CHECK-NEXT: shll v4.4s, v0.4h, #16
; CHECK-NEXT: shll v5.4s, v2.4h, #16
; CHECK-NEXT: shll v6.4s, v1.4h, #16
; CHECK-NEXT: shll v7.4s, v3.4h, #16
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
; CHECK-NEXT: fsub v4.4s, v4.4s, v5.4s
; CHECK-NEXT: fsub v5.4s, v6.4s, v7.4s
; CHECK-NEXT: fsub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: fsub v1.4s, v1.4s, v3.4s
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
; CHECK-NEXT: stp q2, q3, [x2]
; CHECK-NEXT: ret
  %ld.a = load <16 x bfloat>, ptr %a
  %ld.b = load <16 x bfloat>, ptr %b
  %ld.c = load <16 x bfloat>, ptr %c
  %splat.lanes = shufflevector <16 x bfloat> %ld.a, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
                                                                                      i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %neg.b = fneg <16 x bfloat> %ld.b
  %res = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> %neg.b, <16 x bfloat> %splat.lanes, <16 x bfloat> %ld.c)
  store <16 x bfloat> %res, ptr %c
  ret void
}
; fmls, f32, 256-bit fixed-length vectors.
; llvm.fmuladd(splat, -%ld.b, %ld.c): the lane-3 segmented splat of %a is the
; first multiplicand and the negated vector is the second, covering the
; commuted operand order.
; The expected output is the indexed form: fmls z2.s, z1.s, z0.s[3].
define void @fmls_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.s, z1.s, z0.s[3]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <8 x float>, ptr %a
  %ld.b = load <8 x float>, ptr %b
  %ld.c = load <8 x float>, ptr %c
  %splat.lanes = shufflevector <8 x float> %ld.a, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
                                                                                i32 7, i32 7, i32 7, i32 7>
  %neg.b = fneg <8 x float> %ld.b
  %res = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %splat.lanes, <8 x float> %neg.b, <8 x float> %ld.c)
  store <8 x float> %res, ptr %c
  ret void
}
; fmls, f64, 256-bit fixed-length vectors, lane 0 of each 128-bit segment.
; llvm.fmuladd(splat, -%ld.b, %ld.c) with mask <0, 0, 2, 2>, which is the same
; permutation TRN1 produces when both inputs are %ld.a. The splat is the first
; multiplicand.
; The expected output is the indexed form: fmls z2.d, z1.d, z0.d[0].
define void @fmls_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f64_256b_trn1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.d, z1.d, z0.d[0]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %ld.c = load <4 x double>, ptr %c
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %neg.b = fneg <4 x double> %ld.b
  %res = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %splat.lanes, <4 x double> %neg.b, <4 x double> %ld.c)
  store <4 x double> %res, ptr %c
  ret void
}
; fmls, f64, 256-bit fixed-length vectors, lane 1 of each 128-bit segment.
; llvm.fmuladd(-%ld.b, splat, %ld.c) with mask <1, 1, 3, 3>, which is the same
; permutation TRN2 produces when both inputs are %ld.a. The splat is the
; second multiplicand.
; The expected output is the indexed form: fmls z2.d, z1.d, z0.d[1].
define void @fmls_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fmls_indexed_f64_256b_trn2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x1]
; CHECK-NEXT: ldr z2, [x2]
; CHECK-NEXT: fmls z2.d, z1.d, z0.d[1]
; CHECK-NEXT: str z2, [x2]
; CHECK-NEXT: ret
  %ld.a = load <4 x double>, ptr %a
  %ld.b = load <4 x double>, ptr %b
  %ld.c = load <4 x double>, ptr %c
  %splat.lanes = shufflevector <4 x double> %ld.a, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %neg.b = fneg <4 x double> %ld.b
  %res = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %neg.b, <4 x double> %splat.lanes, <4 x double> %ld.c)
  store <4 x double> %res, ptr %c
  ret void
}
; Declarations of the llvm.fmuladd overloads called by the fmla/fmls tests
; in this file, one per element type.
declare <16 x half> @llvm.fmuladd.v16f16(<16 x half>, <16 x half>, <16 x half>)
declare <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)

; Attribute group referenced as #0 by every test function in this file.
; vscale_range(2,2) pins vscale to exactly 2, i.e. 256-bit SVE registers,
; matching the "256b" suffix of the test names. The target features enable
; SVE2.1 (sve2p1), BF16 and the SVE B16B16 extension.
attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16,+sve-b16b16" }