llvm-project/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
David Majnemer 3dd6750027 [AArch64] Add more complete support for BF16
We can use a small amount of integer arithmetic to round FP32 to BF16
and extend BF16 to FP32.

While a number of operations still require promotion, this can be
reduced for some rather simple operations like abs, copysign, fneg but
these can be done in a follow-up.

A few neat optimizations are implemented:
- round-inexact-to-odd is used for F64 to BF16 rounding.
- quieting signaling NaNs for f32 -> bf16 tries to detect if a prior
  operation makes it unnecessary.
2024-03-03 22:39:50 +00:00

221 lines
8.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
; Intrinsic declarations for the calls in @test1, @test2, @tbl1v8i8 and @tbl1.
; No attribute group is attached, matching the other intrinsic declarations in
; this file.
declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>)
declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
; Narrowing float->half conversion whose 64-bit result is concatenated with a
; zero vector. The expected assembly is the fcvtn alone: no extra instruction
; is emitted to clear the upper 64 bits of v0.
define <8 x half> @test1(<4 x float> noundef %a) {
; CHECK-LABEL: test1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
  %narrow.bits = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
  %narrow = bitcast <4 x i16> %narrow.bits to <4 x half>
  ; Mask 0..7 = the four lanes of %narrow followed by four zero lanes.
  %widened = shufflevector <4 x half> %narrow, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x half> %widened
}
; Shift-right-narrow of a loaded <8 x i16>, zero-extended to a 128-bit table
; and fed to tbl1. The expected assembly uses the shrn result directly as the
; 16-byte table operand, without a separate step to zero its upper half.
; Exercises the same IR pattern as @tbl1v8i8.
define <8 x i8> @test2(ptr nocapture noundef readonly %in, <8 x i8> noundef %idx) {
; CHECK-LABEL: test2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: shrn v1.8b, v1.8h, #4
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
  %wide = load <8 x i16>, ptr %in, align 2
  %shifted = lshr <8 x i16> %wide, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %narrow = trunc <8 x i16> %shifted to <8 x i8>
  ; Mask 0..15 = the eight lanes of %narrow followed by eight zero lanes.
  %table = shufflevector <8 x i8> %narrow, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lookup = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %table, <8 x i8> %idx)
  ret <8 x i8> %lookup
}
; Table lookup whose 128-bit table is a narrowed 64-bit vector padded with
; zeros. Expected assembly: load, shrn, tbl -- the table register is consumed
; as v1.16b with no explicit zeroing of its upper 64 bits.
define <8 x i8> @tbl1v8i8(ptr nocapture noundef readonly %in, <8 x i8> noundef %idx) {
; CHECK-LABEL: tbl1v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: shrn v1.8b, v1.8h, #4
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
  %src = load <8 x i16>, ptr %in, align 2
  %src.shr4 = lshr <8 x i16> %src, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %low.bytes = trunc <8 x i16> %src.shr4 to <8 x i8>
  ; Concatenate %low.bytes with a zero vector to form the 16-byte table.
  %tbl.input = shufflevector <8 x i8> %low.bytes, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %picked = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %tbl.input, <8 x i8> %idx)
  ret <8 x i8> %picked
}
; Pairwise add on 64-bit vectors, result widened to 128 bits with zeros.
; Expected assembly is the addp alone.
define <8 x i16> @addpv4i16(<4 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: addpv4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
entry:
  %pairsum = tail call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
  ; Mask 0..7 = the four lanes of %pairsum followed by four zero lanes.
  %widened = shufflevector <4 x i16> %pairsum, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %widened
}
; Plain 64-bit vector add, result widened to 128 bits with zeros.
; Expected assembly is the add alone. The IR operand order (%b, %a) is kept
; as-is because it is reflected in the expected operand order.
define <8 x i16> @addv4i16(<4 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: addv4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ret
entry:
  %sum = add <4 x i16> %b, %a
  ; Mask 0..7 = the four lanes of %sum followed by four zero lanes.
  %widened = shufflevector <4 x i16> %sum, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %widened
}
; Rounding shift-right-narrow by 3, result widened to 128 bits with zeros.
; Expected assembly is the rshrn alone. %b is not referenced by the body.
define <16 x i8> @rshrn(<8 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: rshrn:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rshrn v0.8b, v0.8h, #3
; CHECK-NEXT: ret
entry:
  %narrowed = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a, i32 3)
  ; Mask 0..15 = the eight lanes of %narrowed followed by eight zero lanes.
  %widened = shufflevector <8 x i8> %narrowed, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %widened
}
; 64-bit tbl1 lookup whose result is widened to 128 bits with zeros.
; Expected assembly is the tbl alone.
define <16 x i8> @tbl1(<16 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: tbl1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
; CHECK-NEXT: ret
entry:
  %lookup = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
  ; Mask 0..15 = the eight lanes of %lookup followed by eight zero lanes.
  %widened = shufflevector <8 x i8> %lookup, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %widened
}
; Scalar fadd placed in lane 0 of a <2 x double> whose lane 1 is 0.0.
; Unlike the vector cases in this file, the expected assembly here still
; materialises a zero vector (movi) and inserts the sum into lane 0.
define <2 x double> @fadd(double noundef %x, double noundef %y) {
; CHECK-LABEL: fadd:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: mov v2.d[0], v0.d[0]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %sum = fadd double %x, %y
  ; Build <sum, 0.0> one lane at a time, starting from poison.
  %lane0 = insertelement <2 x double> poison, double %sum, i64 0
  %result = insertelement <2 x double> %lane0, double 0.0, i64 1
  ret <2 x double> %result
}
; Bitwise select written out as (%c & %a) | (~%a & %d), then widened to
; 128 bits with zeros. Expected assembly is a single 64-bit bsl.
; %b is not referenced by the body.
define <16 x i8> @bsl(<4 x i16> noundef %a, <4 x i16> noundef %c, <4 x i16> noundef %d, <4 x i16> noundef %b) {
; CHECK-LABEL: bsl:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  ; Bits taken from %c where the mask %a is set.
  %from.c = and <4 x i16> %c, %a
  ; Bits taken from %d where the mask %a is clear.
  %not.a = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
  %from.d = and <4 x i16> %not.a, %d
  %merged = or <4 x i16> %from.d, %from.c
  %merged.bytes = bitcast <4 x i16> %merged to <8 x i8>
  ; Mask 0..15 = the eight bytes of the select followed by eight zero bytes.
  %widened = shufflevector <8 x i8> %merged.bytes, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %widened
}
; 64-bit vector load widened to 128 bits with zeros. Expected assembly is a
; single `ldr d0`. %b is not referenced by the body.
define <16 x i8> @load(ptr %a, <8 x i8> %b) {
; CHECK-LABEL: load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
entry:
  %loaded = load <8 x i8>, ptr %a
  ; Mask 0..15 = the eight loaded bytes followed by eight zero bytes.
  %widened = shufflevector <8 x i8> %loaded, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %widened
}
; Widen an incoming <8 x i8> argument to <16 x i8> with a zero upper half.
; With no producing instruction to fold into, the expected assembly is a
; single `fmov d0, d0`.
define <16 x i8> @insertzero_v8i8(<8 x i8> %a) {
; CHECK-LABEL: insertzero_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..15 = the eight lanes of %a followed by eight zero lanes.
  %widened = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %widened
}
; Widen an incoming <4 x i16> argument to <8 x i16> with a zero upper half.
; Expected assembly is a single `fmov d0, d0`.
define <8 x i16> @insertzero_v4i16(<4 x i16> %a) {
; CHECK-LABEL: insertzero_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..7 = the four lanes of %a followed by four zero lanes.
  %widened = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %widened
}
; Widen an incoming <2 x i32> argument to <4 x i32> with a zero upper half.
; Expected assembly is a single `fmov d0, d0`.
define <4 x i32> @insertzero_v2i32(<2 x i32> %a) {
; CHECK-LABEL: insertzero_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..3 = the two lanes of %a followed by two zero lanes.
  %widened = shufflevector <2 x i32> %a, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %widened
}
; Widen an incoming <1 x i64> argument to <2 x i64> with a zero upper lane.
; Expected assembly is a single `fmov d0, d0`.
define <2 x i64> @insertzero_v1i64(<1 x i64> %a) {
; CHECK-LABEL: insertzero_v1i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..1 = the single lane of %a followed by one zero lane.
  %widened = shufflevector <1 x i64> %a, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %widened
}
; Widen an incoming <4 x half> argument to <8 x half> with a zero upper half.
; Expected assembly is a single `fmov d0, d0`.
define <8 x half> @insertzero_v4f16(<4 x half> %a) {
; CHECK-LABEL: insertzero_v4f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..7 = the four lanes of %a followed by four zero lanes.
  %widened = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x half> %widened
}
; Widen an incoming <4 x bfloat> argument to <8 x bfloat> with a zero upper
; half. Expected assembly is a single `fmov d0, d0`.
define <8 x bfloat> @insertzero_v4bf16(<4 x bfloat> %a) {
; CHECK-LABEL: insertzero_v4bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..7 = the four lanes of %a followed by four zero lanes.
  %widened = shufflevector <4 x bfloat> %a, <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %widened
}
; Widen an incoming <2 x float> argument to <4 x float> with a zero upper
; half. Expected assembly is a single `fmov d0, d0`.
define <4 x float> @insertzero_v2f32(<2 x float> %a) {
; CHECK-LABEL: insertzero_v2f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..3 = the two lanes of %a followed by two zero lanes.
  %widened = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %widened
}
; Widen an incoming <1 x double> argument to <2 x double> with a zero upper
; lane. Expected assembly is a single `fmov d0, d0`.
define <2 x double> @insertzero_v1f64(<1 x double> %a) {
; CHECK-LABEL: insertzero_v1f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  ; Mask 0..1 = the single lane of %a followed by one zero lane.
  %widened = shufflevector <1 x double> %a, <1 x double> zeroinitializer, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %widened
}
; Intrinsic declarations for the calls in @rshrn and @addpv4i16.
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)