; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -O3 -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
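; The 2048-bit RUN line is checked against the same VBITS_GE_512 prefixes as
; the 512-bit one: once a test's whole fixed-length vector fits in a single
; register, wider implementations emit the same code as the 512-bit run.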
target triple = "aarch64-unknown-linux-gnu"

;
; FMA
;
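; Each test multiplies with 'fmul contract' and adds with 'fadd contract';
; the contract fast-math flag is what permits fusing the pair into a single
; fused multiply-add (FMLA/FMAD/FMADD).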
; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
%mul = fmul contract <4 x half> %op1, %op2
%res = fadd contract <4 x half> %mul, %op3
ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%mul = fmul contract <8 x half> %op1, %op2
%res = fadd contract <8 x half> %mul, %op3
ret <8 x half> %res
}

define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
%op3 = load <16 x half>, ptr %c
%mul = fmul contract <16 x half> %op1, %op2
%res = fadd contract <16 x half> %mul, %op3
store <16 x half> %res, ptr %a
ret void
}
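; With 256-bit vectors the 32 x half FMA is split into two VL16 halves: one
; half reuses a loaded register as the FMAD destination, while the other
; copies its addend into the destination with MOVPRFX before the FMLA.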
define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT: movprfx z1, z5
; VBITS_GE_256-NEXT: fmla z1.h, p0/m, z3.h, z4.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, ptr %a
%op2 = load <32 x half>, ptr %b
%op3 = load <32 x half>, ptr %c
%mul = fmul contract <32 x half> %op1, %op2
%res = fadd contract <32 x half> %mul, %op3
store <32 x half> %res, ptr %a
ret void
}

define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x half>, ptr %a
%op2 = load <64 x half>, ptr %b
%op3 = load <64 x half>, ptr %c
%mul = fmul contract <64 x half> %op1, %op2
%res = fadd contract <64 x half> %mul, %op3
store <64 x half> %res, ptr %a
ret void
}

define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x half>, ptr %a
%op2 = load <128 x half>, ptr %b
%op3 = load <128 x half>, ptr %c
%mul = fmul contract <128 x half> %op1, %op2
%res = fadd contract <128 x half> %mul, %op3
store <128 x half> %res, ptr %a
ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: ret
%mul = fmul contract <2 x float> %op1, %op2
%res = fadd contract <2 x float> %mul, %op3
ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%mul = fmul contract <4 x float> %op1, %op2
%res = fadd contract <4 x float> %mul, %op3
ret <4 x float> %res
}

define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x float>, ptr %a
%op2 = load <8 x float>, ptr %b
%op3 = load <8 x float>, ptr %c
%mul = fmul contract <8 x float> %op1, %op2
%res = fadd contract <8 x float> %mul, %op3
store <8 x float> %res, ptr %a
ret void
}
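; As for v32f16: under VBITS_GE_256 the 16 x float FMA is split into two VL8
; halves, one lowered to FMAD and the other to a MOVPRFX + FMLA pair.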
define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT: movprfx z1, z5
; VBITS_GE_256-NEXT: fmla z1.s, p0/m, z3.s, z4.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, ptr %a
%op2 = load <16 x float>, ptr %b
%op3 = load <16 x float>, ptr %c
%mul = fmul contract <16 x float> %op1, %op2
%res = fadd contract <16 x float> %mul, %op3
store <16 x float> %res, ptr %a
ret void
}

define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x float>, ptr %a
%op2 = load <32 x float>, ptr %b
%op3 = load <32 x float>, ptr %c
%mul = fmul contract <32 x float> %op1, %op2
%res = fadd contract <32 x float> %mul, %op3
store <32 x float> %res, ptr %a
ret void
}

define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x float>, ptr %a
%op2 = load <64 x float>, ptr %b
%op3 = load <64 x float>, ptr %c
%mul = fmul contract <64 x float> %op1, %op2
%res = fadd contract <64 x float> %mul, %op3
store <64 x float> %res, ptr %a
ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmadd d0, d0, d1, d2
; CHECK-NEXT: ret
%mul = fmul contract <1 x double> %op1, %op2
%res = fadd contract <1 x double> %mul, %op3
ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%mul = fmul contract <2 x double> %op1, %op2
%res = fadd contract <2 x double> %mul, %op3
ret <2 x double> %res
}

define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x double>, ptr %a
%op2 = load <4 x double>, ptr %b
%op3 = load <4 x double>, ptr %c
%mul = fmul contract <4 x double> %op1, %op2
%res = fadd contract <4 x double> %mul, %op3
store <4 x double> %res, ptr %a
ret void
}
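; As above: under VBITS_GE_256 the 8 x double FMA splits into two VL4 halves,
; again using FMAD for one half and MOVPRFX + FMLA for the other.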
define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT: movprfx z1, z5
; VBITS_GE_256-NEXT: fmla z1.d, p0/m, z3.d, z4.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, ptr %a
%op2 = load <8 x double>, ptr %b
%op3 = load <8 x double>, ptr %c
%mul = fmul contract <8 x double> %op1, %op2
%res = fadd contract <8 x double> %mul, %op3
store <8 x double> %res, ptr %a
ret void
}

define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x double>, ptr %a
%op2 = load <16 x double>, ptr %b
%op3 = load <16 x double>, ptr %c
%mul = fmul contract <16 x double> %op1, %op2
%res = fadd contract <16 x double> %mul, %op3
store <16 x double> %res, ptr %a
ret void
}

define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x double>, ptr %a
%op2 = load <32 x double>, ptr %b
%op3 = load <32 x double>, ptr %c
%mul = fmul contract <32 x double> %op1, %op2
%res = fadd contract <32 x double> %mul, %op3
store <32 x double> %res, ptr %a
ret void
}

attributes #0 = { "target-features"="+sve" }