This patch adds: 1. Support for recognizing the bf16 type in the frontend, plus isel/ABI support for scalar bf16 programs. Limitation: fp_to_bf16 is generated with a TableGen pattern instead of being lowered via expansion, because we do not yet have support for the fcanonicalize instruction, which should prevent an SNaN from being converted to an infinity by truncation. 2. Vector codegen support for bf16. Patch By: Fateme Hosseini Co-authored-by: Muntasir Mallick <quic_mallick@quicinc.com> Co-authored-by: Muntasir Mallick <mallick@qti.qualcomm.com> Co-authored-by: Kaushik Kulkarni <quic_kauskulk@quicinc.com>
117 lines
4.0 KiB
LLVM
117 lines
4.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=hexagon --mattr=+hvx-length128b,+hvx-qfloat,+hvxv81,+v81 < %s | FileCheck %s

; bf16_vec_add: element-wise fadd of two <64 x bfloat> vectors.
;
; The IR loads <64 x bfloat> from %a and %b (both align 2), adds them with a
; single vector fadd, and stores the <64 x bfloat> result to %c (align 2).
;
; Shape of the expected Hexagon output, as pinned by the assertions below:
;   * both operands are read with unaligned vector loads (vmemu), and the
;     result is written back with vmemu in the return packet;
;   * a zero vector is formed per operand by vxor of the register with itself
;     and combined with the operand through vshuffoe/vshuff (presumably to
;     widen every bf16 element into a 32-bit lane -- confirm against the
;     lowering code);
;   * the addition is performed on .sf inputs producing .qf32, and each half
;     is converted back to .sf;
;   * a vand/vcmp/vadd/vmux sequence using the splatted constants 32768,
;     131071 and 32767 follows, then a logical shift right by 16 and a
;     saturating vpack to halfwords (this looks like the rounding and
;     NaN-handling part of the f32 -> bf16 truncation; the self-compare
;     vcmp.eq(x.sf, x.sf) is presumably the NaN test -- TODO confirm).
;
; Register names are matched through FileCheck pattern variables, so the test
; does not depend on the exact register allocation.
;
; Function Attrs: mustprogress nounwind
define dso_local void @bf16_vec_add(ptr noundef %c, ptr noundef %a, ptr noundef %b) local_unnamed_addr #0 {
; CHECK-LABEL: bf16_vec_add:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: {
; CHECK-NEXT: [[R7:r[0-9]+]] = #-4
; CHECK-NEXT: [[V0:v[0-9]+]] = vmemu([[R2:r[0-9]+]]+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[R2]] = ##32768
; CHECK-NEXT: [[V1:v[0-9]+]] = vmemu([[R1:r[0-9]+]]+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[R6:r[0-9]+]] = ##131071
; CHECK-NEXT: [[V2:v[0-9]+]] = vxor([[V0]],[[V0]])
; CHECK-NEXT: [[V3:v[0-9]+]] = vxor([[V1]],[[V1]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V25:v[0-9]+]] = vsplat([[R2]])
; CHECK-NEXT: [[R5:r[0-9]+]] = #16
; CHECK-NEXT: [[V5_4:v[0-9]+:[0-9]+]].h = vshuffoe([[V0]].h,[[V2]].h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V26:v[0-9]+]] = vsplat([[R6]])
; CHECK-NEXT: [[R4:r[0-9]+]] = #32767
; CHECK-NEXT: [[V31_30:v[0-9]+:[0-9]+]].h = vshuffoe([[V1]].h,[[V3]].h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V5_4]] = vshuff([[V5:v[0-9]+]],[[V4:v[0-9]+]],[[R7]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V31:v[0-9]+]].h = vsplat([[R4]])
; CHECK-NEXT: [[V3_2:v[0-9]+:[0-9]+]] = vshuff([[V31]],[[V30:v[0-9]+]],[[R7]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V2]].qf32 = vadd([[V2]].sf,[[V4]].sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V3]].qf32 = vadd([[V3]].sf,[[V5]].sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V2]].sf = [[V2]].qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V3]].sf = [[V3]].qf32
; CHECK-NEXT: [[V27:v[0-9]+]] = vand([[V2]],[[V25]])
; CHECK-NEXT: [[V28:v[0-9]+]] = vand([[V2]],[[V26]])
; CHECK-NEXT: [[Q2:q[0-9]+]] = vcmp.eq([[V2]].sf,[[V2]].sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V29:v[0-9]+]] = vand([[V3]],[[V25]])
; CHECK-NEXT: [[V1]] = vand([[V3]],[[V26]])
; CHECK-NEXT: [[Q0:q[0-9]+]] = vcmp.eq([[V28]].w,[[V25]].w)
; CHECK-NEXT: [[V4]].w = vadd([[V2]].w,[[V27]].w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V5]].w = vadd([[V3]].w,[[V29]].w)
; CHECK-NEXT: [[Q1:q[0-9]+]] = vcmp.eq([[V1]].w,[[V25]].w)
; CHECK-NEXT: [[V30:v[0-9]+]] = vmux([[Q0]],[[V2]],[[V4]])
; CHECK-NEXT: [[Q3:q[0-9]+]] = vcmp.eq([[V3]].sf,[[V3]].sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V1]] = vmux([[Q1]],[[V3]],[[V5]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V0]].uw = vlsr([[V30]].uw,[[R5]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V1]].uw = vlsr([[V1]].uw,[[R5]])
; CHECK-NEXT: [[V0]] = vmux([[Q2]],[[V0]],[[V31]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V1]] = vmux([[Q3]],[[V1]],[[V31]])
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: [[V0]].uh = vpack([[V1]].w,[[V0]].w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr [[R31:r[0-9]+]]
; CHECK-NEXT: vmemu([[R0:r[0-9]+]]+#0) = [[V0]]
; CHECK-NEXT: }
entry:
  %0 = load <64 x bfloat>, ptr %a, align 2
  %1 = load <64 x bfloat>, ptr %b, align 2
  %add.ripple.vectorized = fadd <64 x bfloat> %0, %1
  store <64 x bfloat> %add.ripple.vectorized, ptr %c, align 2
  ret void
}
|
|
|
|
; copy1d: plain copy of a <128 x half> vector from %X to %Y.
;
; The IR performs one <128 x half> load from %X and one <128 x half> store to
; %Y, both with align 2; no arithmetic is involved. 128 half elements occupy
; 256 bytes, so the expected output moves the data as two vector registers:
; two unaligned loads (vmemu at r0+#1 and r0+#0) followed by two unaligned
; stores (vmemu at r1+#1 and r1+#0), the last one placed in the return packet.
;
; Unlike a pattern-variable based test, the assertions here name the concrete
; registers (v0, v1, r0, r1, r31), so they are sensitive to register
; allocation changes.
define dso_local void @copy1d(ptr noundef readonly captures(none) %X, ptr noundef writeonly captures(none) %Y) local_unnamed_addr #0 {
; CHECK-LABEL: copy1d:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmemu(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmemu(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: vmemu(r1+#1) = v0
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmemu(r1+#0) = v1
; CHECK-NEXT: }
entry:
  %0 = load <128 x half>, ptr %X, align 2
  store <128 x half> %0, ptr %Y, align 2
  ret void
}
|