
We can use a small amount of integer arithmetic to round FP32 to BF16 and to extend BF16 to FP32. A number of operations still require promotion to FP32; that set could be shrunk for simple operations such as abs, copysign, and fneg, but those can be handled in a follow-up. A few neat optimizations are implemented:
- round-inexact-to-odd is used for F64 to BF16 rounding.
- quieting of signaling NaNs for F32 -> BF16 tries to detect when a prior operation already makes it unnecessary.
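As a rough illustration of the integer-arithmetic idea (a hypothetical sketch, not the exact sequence this change emits), the IR below rounds an f32 to the bf16 bit pattern with round-to-nearest-even; the function name is made up, and NaN quieting and the round-inexact-to-odd path for f64 are omitted:

; Hypothetical sketch only: f32 -> bf16 rounding via integer arithmetic
; (round-to-nearest-even, NaN handling omitted).
define i16 @f32_to_bf16_sketch(float %x) {
entry:
  %bits = bitcast float %x to i32
  ; low bit of the would-be bf16 mantissa, used to break ties to even
  %hi = lshr i32 %bits, 16
  %lsb = and i32 %hi, 1
  ; rounding bias is 0x7FFF plus the tie-breaking bit
  %bias = add i32 %lsb, 32767
  %rounded = add i32 %bits, %bias
  ; the top 16 bits of the biased value are the bf16 encoding
  %hi16 = lshr i32 %rounded, 16
  %bf16 = trunc i32 %hi16 to i16
  ret i16 %bf16
}

Extending bf16 back to f32 is the cheap direction: zero-extend the 16-bit pattern to i32 and shift it left by 16.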
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s

declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) #2
declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) #2

define <8 x half> @test1(<4 x float> noundef %a) {
; CHECK-LABEL: test1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
  %vcvt_f16_f321.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
  %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half>
  %shuffle.i = shufflevector <4 x half> %0, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x half> %shuffle.i
}

define <8 x i8> @test2(ptr nocapture noundef readonly %in, <8 x i8> noundef %idx) {
; CHECK-LABEL: test2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: shrn v1.8b, v1.8h, #4
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
  %0 = load <8 x i16>, ptr %in, align 2
  %1 = lshr <8 x i16> %0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
  %vtbl1.i = shufflevector <8 x i8> %vshrn_n, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %idx)
  ret <8 x i8> %vtbl11.i
}

define <8 x i8> @tbl1v8i8(ptr nocapture noundef readonly %in, <8 x i8> noundef %idx) {
; CHECK-LABEL: tbl1v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: shrn v1.8b, v1.8h, #4
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
; CHECK-NEXT: ret
entry:
  %0 = load <8 x i16>, ptr %in, align 2
  %1 = lshr <8 x i16> %0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
  %vtbl1.i = shufflevector <8 x i8> %vshrn_n, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %idx)
  ret <8 x i8> %vtbl11.i
}

define <8 x i16> @addpv4i16(<4 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: addpv4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
entry:
  %vpadd_v2.i = tail call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
  %shuffle.i = shufflevector <4 x i16> %vpadd_v2.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %shuffle.i
}

define <8 x i16> @addv4i16(<4 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: addv4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ret
entry:
  %add.i = add <4 x i16> %b, %a
  %shuffle.i = shufflevector <4 x i16> %add.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %shuffle.i
}

define <16 x i8> @rshrn(<8 x i16> noundef %a, <4 x i16> noundef %b) {
; CHECK-LABEL: rshrn:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rshrn v0.8b, v0.8h, #3
; CHECK-NEXT: ret
entry:
  %vrshrn_n1 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a, i32 3)
  %shuffle.i = shufflevector <8 x i8> %vrshrn_n1, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle.i
}

define <16 x i8> @tbl1(<16 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: tbl1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
; CHECK-NEXT: ret
entry:
  %vtbl11 = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
  %shuffle.i = shufflevector <8 x i8> %vtbl11, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle.i
}

define <2 x double> @fadd(double noundef %x, double noundef %y) {
; CHECK-LABEL: fadd:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: mov v2.d[0], v0.d[0]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %add = fadd double %x, %y
  %vecinit1 = insertelement <2 x double> poison, double %add, i64 0
  %vecinit2 = insertelement <2 x double> %vecinit1, double 0.0, i64 1
  ret <2 x double> %vecinit2
}

define <16 x i8> @bsl(<4 x i16> noundef %a, <4 x i16> noundef %c, <4 x i16> noundef %d, <4 x i16> noundef %b) {
; CHECK-LABEL: bsl:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  %vbsl3.i = and <4 x i16> %c, %a
  %0 = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
  %vbsl4.i = and <4 x i16> %0, %d
  %vbsl5.i = or <4 x i16> %vbsl4.i, %vbsl3.i
  %1 = bitcast <4 x i16> %vbsl5.i to <8 x i8>
  %shuffle.i = shufflevector <8 x i8> %1, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle.i
}

define <16 x i8> @load(ptr %a, <8 x i8> %b) {
; CHECK-LABEL: load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
entry:
  %vtbl11 = load <8 x i8>, ptr %a
  %shuffle.i = shufflevector <8 x i8> %vtbl11, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle.i
}

define <16 x i8> @insertzero_v8i8(<8 x i8> %a) {
; CHECK-LABEL: insertzero_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle.i
}

define <8 x i16> @insertzero_v4i16(<4 x i16> %a) {
; CHECK-LABEL: insertzero_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %shuffle.i
}

define <4 x i32> @insertzero_v2i32(<2 x i32> %a) {
; CHECK-LABEL: insertzero_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %shuffle.i
}

define <2 x i64> @insertzero_v1i64(<1 x i64> %a) {
; CHECK-LABEL: insertzero_v1i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %shuffle.i
}

define <8 x half> @insertzero_v4f16(<4 x half> %a) {
; CHECK-LABEL: insertzero_v4f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x half> %shuffle.i
}

define <8 x bfloat> @insertzero_v4bf16(<4 x bfloat> %a) {
; CHECK-LABEL: insertzero_v4bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %shuffle.i
}

define <4 x float> @insertzero_v2f32(<2 x float> %a) {
; CHECK-LABEL: insertzero_v2f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle.i
}

define <2 x double> @insertzero_v1f64(<1 x double> %a) {
; CHECK-LABEL: insertzero_v1f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, d0
; CHECK-NEXT: ret
entry:
  %shuffle.i = shufflevector <1 x double> %a, <1 x double> zeroinitializer, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuffle.i
}

declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)