[AMDGPU] True16 support for bf16 clamp pattern on gfx1250 (#190036)
This commit is contained in:
parent
c6669c4993
commit
a9df7c7186
@ -2044,9 +2044,17 @@ let SubtargetPredicate = UseRealTrue16Insts in
|
||||
def : ClampPat<V_MAX_F16_t16_e64, f16>;
|
||||
let SubtargetPredicate = UseFakeTrue16Insts in
|
||||
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
|
||||
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
|
||||
let True16Predicate = UseFakeTrue16Insts in
|
||||
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
|
||||
// bf16 clamp selection used when real True16 instructions are enabled and the
// subtarget has packed bf16 instructions.
// The 16-bit $src0 is placed in the lo16 half of a 32-bit VGPR (hi16 is left
// as IMPLICIT_DEF), that register is passed as both operands of
// V_PK_MAX_NUM_BF16 with the destination clamp bit enabled, and the lo16 half
// of the packed result is extracted as the bf16 value.
let True16Predicate = UseRealTrue16Insts,
|
||||
SubtargetPredicate = HasBF16PackedInsts in
|
||||
def : GCNPat<(bf16 (AMDGPUclamp (VOP3Mods bf16:$src0, i32:$src0_modifiers))),
|
||||
(EXTRACT_SUBREG
|
||||
(V_PK_MAX_NUM_BF16
|
||||
$src0_modifiers, (REG_SEQUENCE VGPR_32, $src0, lo16, (bf16 (IMPLICIT_DEF)), hi16),
|
||||
$src0_modifiers, (REG_SEQUENCE VGPR_32, $src0, lo16, (bf16 (IMPLICIT_DEF)), hi16),
|
||||
DSTCLAMP.ENABLE), lo16)
|
||||
>;
|
||||
|
||||
let SubtargetPredicate = HasVOP3PInsts in {
|
||||
def : GCNPat <
|
||||
|
||||
@ -1,26 +1,41 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,TRUE16 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,FAKE16 %s
|
||||
|
||||
; TODO: Add GlobalISel RUN lines once GlobalISel can support bf16.
|
||||
; llvm.sqrt.bf16 of a bfloat argument that is not marked inreg (the checks show
; it arriving in v2); the result is stored to %out with 2-byte alignment.
; The real-True16 checks name the 16-bit half v2.l for the sqrt operands, the
; fake-True16 checks name the full register v2.
define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) {
|
||||
; GCN-LABEL: llvm_sqrt_bf16_v:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_sqrt_bf16_e32 v2, v2
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_sqrt_bf16_v:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_sqrt_bf16_e32 v2.l, v2.l
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_sqrt_bf16_v:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_sqrt_bf16_e32 v2, v2
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
|
||||
store bfloat %sqrt, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
|
||||
; GCN-LABEL: llvm_sqrt_bf16_s:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_sqrt_bf16_e32 v2, s0
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_sqrt_bf16_s:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_sqrt_bf16_e32 v2.l, s0
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_sqrt_bf16_s:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_sqrt_bf16_e32 v2, s0
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src)
|
||||
store bfloat %sqrt, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
@ -353,27 +368,94 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
|
||||
}
|
||||
|
||||
; Clamp idiom on a bfloat argument: maxnum(%src, 0.0) followed by
; minnum(.., 1.0), returned as a bfloat.
; Every check variant expects a single v_pk_max_num_bf16 of v0 with itself
; carrying the clamp modifier; the real-True16 variant additionally carries
; op_sel_hi:[0,0].
define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
|
||||
; GCN-LABEL: test_clamp_bf16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
; TRUE16-LABEL: test_clamp_bf16:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
|
||||
; TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; FAKE16-LABEL: test_clamp_bf16:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
|
||||
; FAKE16-NEXT: ; return to shader part epilog
|
||||
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
|
||||
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
|
||||
ret bfloat %clamp
|
||||
}
|
||||
|
||||
; Same clamp idiom as above the 0.0/1.0 bounds below, but on an inreg bfloat
; argument (the checks show it arriving in s0).
; The real-True16 checks first copy s0 into v0.l with v_mov_b16 and then clamp
; v0 with op_sel_hi:[0,0]; the fake-True16 checks feed s0 directly to
; v_pk_max_num_bf16.
define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
|
||||
; GCN-LABEL: test_clamp_bf16_s:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
; TRUE16-LABEL: test_clamp_bf16_s:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
|
||||
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
|
||||
; TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; FAKE16-LABEL: test_clamp_bf16_s:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
|
||||
; FAKE16-NEXT: ; return to shader part epilog
|
||||
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
|
||||
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
|
||||
ret bfloat %clamp
|
||||
}
|
||||
|
||||
; Clamp of element 1 of a <2 x bfloat> argument: the element is extracted,
; clamped with maxnum(.., 0.0) / minnum(.., 1.0), and extended to float.
; The real-True16 checks move v0.h into v0.l before the packed max and build
; the float by writing 0 to v0.l and the clamped half to v0.h; the fake-True16
; checks shift right by 16 before the packed max and left by 16 after it.
define amdgpu_ps float @test_clamp_bf16_hi16(<2 x bfloat> %src) {
|
||||
; TRUE16-LABEL: test_clamp_bf16_hi16:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
|
||||
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; TRUE16-NEXT: v_pk_max_num_bf16 v1, v0, v0 op_sel_hi:[0,0] clamp
|
||||
; TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
|
||||
; TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; FAKE16-LABEL: test_clamp_bf16_hi16:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
|
||||
; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; FAKE16-NEXT: ; return to shader part epilog
|
||||
%bsrc = extractelement <2 x bfloat> %src, i32 1
|
||||
%max = call bfloat @llvm.maxnum.bf16(bfloat %bsrc, bfloat 0.0)
|
||||
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
|
||||
%res = fpext bfloat %clamp to float
|
||||
ret float %res
|
||||
}
|
||||
|
||||
; Clamp of element 1 of an inreg <2 x bfloat> argument, extended to float.
; Both check variants shift s0 right by 16 first and shift the clamped result
; left by 16 at the end. The real-True16 checks copy s0 into v0.l and clamp v0
; with op_sel_hi:[0,0]; the fake-True16 checks pass s0 directly to
; v_pk_max_num_bf16.
define amdgpu_ps float @test_clamp_bf16_hi16_s(<2 x bfloat> inreg %src) {
|
||||
; TRUE16-LABEL: test_clamp_bf16_hi16_s:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
|
||||
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
|
||||
; TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; FAKE16-LABEL: test_clamp_bf16_hi16_s:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; FAKE16-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
|
||||
; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; FAKE16-NEXT: ; return to shader part epilog
|
||||
%bsrc = extractelement <2 x bfloat> %src, i32 1
|
||||
%max = call bfloat @llvm.maxnum.bf16(bfloat %bsrc, bfloat 0.0)
|
||||
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
|
||||
%res = fpext bfloat %clamp to float
|
||||
ret float %res
|
||||
}
|
||||
|
||||
define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
|
||||
; GCN-LABEL: test_clamp_v2bf16:
|
||||
; GCN: ; %bb.0:
|
||||
@ -399,14 +481,23 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
|
||||
}
|
||||
|
||||
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
|
||||
; GCN-LABEL: test_clamp_bf16_folding:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_exp_bf16_e32 v0, v0
|
||||
; GCN-NEXT: v_nop
|
||||
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
; TRUE16-LABEL: test_clamp_bf16_folding:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l
|
||||
; TRUE16-NEXT: v_nop
|
||||
; TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; TRUE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp
|
||||
; TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; FAKE16-LABEL: test_clamp_bf16_folding:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_exp_bf16_e32 v0, v0
|
||||
; FAKE16-NEXT: v_nop
|
||||
; FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
|
||||
; FAKE16-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
|
||||
; FAKE16-NEXT: ; return to shader part epilog
|
||||
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
|
||||
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
|
||||
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
|
||||
@ -560,48 +651,76 @@ define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat>
|
||||
}
|
||||
|
||||
; llvm.log2.bf16 of a bfloat argument that is not marked inreg (the checks show
; it arriving in v2); the result is stored to %out with 2-byte alignment.
; The real-True16 checks name the 16-bit half v2.l for the log operands, the
; fake-True16 checks name the full register v2.
define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
|
||||
; GCN-LABEL: llvm_log2_bf16_v:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_log_bf16_e32 v2, v2
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_log2_bf16_v:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_log_bf16_e32 v2.l, v2.l
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_log2_bf16_v:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_log_bf16_e32 v2, v2
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%log = call bfloat @llvm.log2.bf16(bfloat %src)
|
||||
store bfloat %log, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; llvm.log2.bf16 of an inreg bfloat argument (the checks show it read from s0);
; the result is stored to %out with 2-byte alignment.
; The real-True16 checks write the result to the 16-bit half v2.l, the
; fake-True16 checks write it to the full register v2.
define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
|
||||
; GCN-LABEL: llvm_log2_bf16_s:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_log_bf16_e32 v2, s0
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_log2_bf16_s:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_log_bf16_e32 v2.l, s0
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_log2_bf16_s:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_log_bf16_e32 v2, s0
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%log = call bfloat @llvm.log2.bf16(bfloat %src)
|
||||
store bfloat %log, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; llvm.exp2.bf16 of a bfloat argument that is not marked inreg (the checks show
; it arriving in v2); the result is stored to %out with 2-byte alignment.
; The real-True16 checks name the 16-bit half v2.l for the exp operands, the
; fake-True16 checks name the full register v2.
define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
|
||||
; GCN-LABEL: llvm_exp2_bf16_v:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_exp_bf16_e32 v2, v2
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_exp2_bf16_v:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_exp_bf16_e32 v2.l, v2.l
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_exp2_bf16_v:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_exp_bf16_e32 v2, v2
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
|
||||
store bfloat %exp, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
|
||||
; GCN-LABEL: llvm_exp2_bf16_s:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GCN-NEXT: v_exp_bf16_e32 v2, s0
|
||||
; GCN-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; TRUE16-LABEL: llvm_exp2_bf16_s:
|
||||
; TRUE16: ; %bb.0:
|
||||
; TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; TRUE16-NEXT: v_exp_bf16_e32 v2.l, s0
|
||||
; TRUE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; FAKE16-LABEL: llvm_exp2_bf16_s:
|
||||
; FAKE16: ; %bb.0:
|
||||
; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; FAKE16-NEXT: v_exp_bf16_e32 v2, s0
|
||||
; FAKE16-NEXT: global_store_b16 v[0:1], v2, off
|
||||
; FAKE16-NEXT: s_endpgm
|
||||
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
|
||||
store bfloat %exp, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user