Keep bf16/f16 values encoded as the low half of a 32-bit register instead of promoting them to float. This avoids the unwanted FP effects of the fpext/fptrunc pair, which should not be implied by merely passing an argument. This also fixes the ABI divergence between SelectionDAG and GlobalISel. I've wanted to make this change for ages, but my last few attempts failed. The main complication was the hack to return shader integer types in SGPRs, which now needs to inspect the underlying IR type.
2668 lines
127 KiB
LLVM
2668 lines
127 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX9
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16
|
|
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
|
|
|
|
; Scalar case: unsigned-convert an i1 argument to bfloat and return the bfloat
; directly. In the checked output for every prefix, the input is masked to
; bit 0, compared against 1, and v_cndmask selects between 0 and 1.0. The last
; instruction before the return moves the upper 16 bits of v0 into its low half
; (v_lshrrev_b32 by 16, or v_mov_b16 v0.l, v0.h in the real-true16 runs).
; The assertions below are autogenerated; regenerate them rather than editing
; them by hand.
define bfloat @v_uitofp_i1_to_bf16(i1 %num) {
|
|
; GFX7-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-TRUE16-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX12-TRUE16: ; %bb.0:
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
|
|
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-FAKE16-LABEL: v_uitofp_i1_to_bf16:
|
|
; GFX12-FAKE16: ; %bb.0:
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%op = uitofp i1 %num to bfloat
|
|
ret bfloat %op
|
|
}
|
|
|
|
; amdgpu_ps variant of the scalar case with an inreg i1 argument. The bfloat
; produced by uitofp is bitcast to i16 and zero-extended to i32, so the
; function returns an integer rather than a bfloat. In the checked output the
; input bit is tested with s_bitcmp1_b32 on s0, the conversion itself runs in
; vector registers, and the result is copied back to s0 with
; v_readfirstlane_b32 before the shader part epilog.
; The assertions below are autogenerated; regenerate them rather than editing
; them by hand.
define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
|
|
; GFX7-LABEL: s_uitofp_i1_to_bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_uitofp_i1_to_bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
|
|
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
|
|
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX11-LABEL: s_uitofp_i1_to_bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
|
|
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX11-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_uitofp_i1_to_bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
|
|
; GFX12-NEXT: v_bfe_u32 v1, v0, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v0
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
%op = uitofp i1 %num to bfloat
|
|
%b16 = bitcast bfloat %op to i16
|
|
%b32 = zext i16 %b16 to i32
|
|
ret i32 %b32
|
|
}
|
|
|
|
; Two-element vector case: unsigned-convert <2 x i1> to <2 x bfloat> and
; return the vector directly. In the checked output each element is converted
; separately (mask, compare against 1, v_cndmask between 0 and 1.0) and the two
; results are then combined into v0 by a single instruction: v_alignbit_b32 in
; the GFX7 checks, v_perm_b32 with selector 0x7060302 in the GFX9 and fake16
; checks, and v_mov_b16 v0.l, v1.h in the real-true16 checks.
; The assertions below are autogenerated; regenerate them rather than editing
; them by hand.
define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
|
|
; GFX7-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
|
|
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
|
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-TRUE16-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX12-TRUE16: ; %bb.0:
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
|
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
|
|
; GFX12-FAKE16: ; %bb.0:
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%op = uitofp <2 x i1> %num to <2 x bfloat>
|
|
ret <2 x bfloat> %op
|
|
}
|
|
|
|
; amdgpu_ps variant of the two-element vector case with an inreg <2 x i1>
; argument. The <2 x bfloat> produced by uitofp is bitcast to <2 x i16> and
; zero-extended to <2 x i32>, so the function returns integers rather than
; bfloats. In the checked output the two input bits are read from s0 and s1,
; each element is converted in vector registers and shifted right by 16, and
; the results are copied back to s0 and s1 with v_readfirstlane_b32 before the
; shader part epilog.
; The assertions below are autogenerated; regenerate them rather than editing
; them by hand.
define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
|
|
; GFX7-LABEL: s_uitofp_v2i1_to_v2bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_and_b32 s2, 1, s1
|
|
; GFX7-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_uitofp_v2i1_to_v2bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_and_b32 s2, 1, s1
|
|
; GFX9-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, v3, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
|
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX11-LABEL: s_uitofp_v2i1_to_v2bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
|
|
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX11-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_uitofp_v2i1_to_v2bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
|
|
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX12-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
%op = uitofp <2 x i1> %num to <2 x bfloat>
|
|
%b16 = bitcast <2 x bfloat> %op to <2 x i16>
|
|
%b32 = zext <2 x i16> %b16 to <2 x i32>
|
|
ret <2 x i32> %b32
|
|
}
|
|
|
|
define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
|
|
; GFX7-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v3, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
|
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
|
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
|
|
; GFX9-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-TRUE16-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX12-TRUE16: ; %bb.0:
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
|
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
|
|
; GFX12-FAKE16: ; %bb.0:
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%op = uitofp <3 x i1> %num to <3 x bfloat>
|
|
ret <3 x bfloat> %op
|
|
}
|
|
|
|
; Scalar variant of the <3 x i1> to <3 x bfloat> uitofp test. The argument is
; passed inreg to an amdgpu_ps function, and the bfloat result is bitcast to
; <3 x i16> and zero-extended so that each element is returned as an i32.
; The check lines below are autogenerated by update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX7-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_and_b32 s4, 1, s2
; GFX7-NEXT: s_and_b32 s2, 1, s1
; GFX7-NEXT: s_bitcmp1_b32 s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s4, 1, s2
; GFX9-NEXT: s_and_b32 s2, 1, s1
; GFX9-NEXT: s_bitcmp1_b32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[4:5]
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX9-NEXT: v_add_u32_e32 v4, v4, v2
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s2, 1, s2
; GFX11-NEXT: s_and_b32 s1, 1, s1
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_and_b32 s2, 1, s2
; GFX12-NEXT: s_and_b32 s1, 1, s1
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s2, 1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_readfirstlane_b32 s1, v0
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readfirstlane_b32 s2, v2
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: ; return to shader part epilog
  %op = uitofp <3 x i1> %num to <3 x bfloat>
  %b16 = bitcast <3 x bfloat> %op to <3 x i16>
  %b32 = zext <3 x i16> %b16 to <3 x i32>
  ret <3 x i32> %b32
}
|
|
|
|
; Vector-register variant: converts a <4 x i1> argument to <4 x bfloat> with
; uitofp and returns the result directly.
; The check lines below are autogenerated by update_llc_test_checks.py;
; regenerate them with the script instead of editing them by hand.
define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX7-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s0
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s0
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s0
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s0
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l
; GFX12-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %op = uitofp <4 x i1> %num to <4 x bfloat>
  ret <4 x bfloat> %op
}
|
|
|
|
define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
|
|
; GFX7-LABEL: s_uitofp_v4i1_to_v4bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_and_b32 s6, 1, s3
|
|
; GFX7-NEXT: s_and_b32 s4, 1, s2
|
|
; GFX7-NEXT: s_and_b32 s2, 1, s1
|
|
; GFX7-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s6, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[6:7]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; GFX7-NEXT: v_readfirstlane_b32 s0, v3
|
|
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
|
|
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_uitofp_v4i1_to_v4bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_and_b32 s6, 1, s3
|
|
; GFX9-NEXT: s_and_b32 s4, 1, s2
|
|
; GFX9-NEXT: s_and_b32 s2, 1, s1
|
|
; GFX9-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s6, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[6:7]
|
|
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
|
|
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
|
|
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v4, v4, v2
|
|
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v2
|
|
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
|
|
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v5, v5, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3
|
|
; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s0, v3
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX11-LABEL: s_uitofp_v4i1_to_v4bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_and_b32 s3, 1, s3
|
|
; GFX11-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX11-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
|
|
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
|
|
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s3, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
|
|
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
|
|
; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
|
|
; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
|
|
; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
|
|
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
|
|
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX11-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_uitofp_v4i1_to_v4bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_and_b32 s3, 1, s3
|
|
; GFX12-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX12-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
|
|
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
|
|
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s3, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
|
|
; GFX12-NEXT: s_cselect_b32 s3, -1, 0
|
|
; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
|
|
; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
|
|
; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v4
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
|
|
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v3
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX12-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
|
|
; GFX12-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
%op = uitofp <4 x i1> %num to <4 x bfloat>
|
|
%b16 = bitcast <4 x bfloat> %op to <4 x i16>
|
|
%b32 = zext <4 x i16> %b16 to <4 x i32>
|
|
ret <4 x i32> %b32
|
|
}
|
|
|
|
; sitofp i1 -> bfloat with the default calling convention (value in and out of
; v0). Each expected sequence masks the input to bit 0, selects 0 or -1.0 as an
; f32 with v_cndmask, and leaves the upper 16 bits of that f32 in the low half
; of v0. Every subtarget except hawaii first runs a bfe/add/or/cmp_u/cndmask
; sequence on the f32 -- presumably the generic f32->bf16 rounding expansion.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define bfloat @v_sitofp_i1_to_bf16(i1 %num) {
; GFX7-LABEL: v_sitofp_i1_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i1_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s0
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_sitofp_i1_to_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_sitofp_i1_to_bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: v_sitofp_i1_to_bf16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_sitofp_i1_to_bf16:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %op = sitofp i1 %num to bfloat
  ret bfloat %op
}
|
|
|
|
; amdgpu_ps variant of the scalar i1 -> bfloat signed conversion: the argument
; is inreg (read from s0 in the expected code) and the bfloat result is bitcast
; to i16 and sign-extended to i32 before being returned. Every expected
; sequence builds 0 / -1.0 as an f32 in a VGPR, arithmetic-shifts it right by
; 16, and copies it into s0 with v_readfirstlane_b32.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
; GFX7-LABEL: s_sitofp_i1_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_bitcmp1_b32 s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_sitofp_i1_to_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bitcmp1_b32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_sitofp_i1_to_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_sitofp_i1_to_bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX12-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v0
; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: ; return to shader part epilog
  %op = sitofp i1 %num to bfloat
  %b16 = bitcast bfloat %op to i16
  %b32 = sext i16 %b16 to i32
  ret i32 %b32
}
|
|
|
|
; sitofp <2 x i1> -> <2 x bfloat> with the default calling convention. The
; expected code reads the two lanes from v0 and v1, converts each to 0 / -1.0
; as an f32, and packs the upper 16 bits of both into v0: v_alignbit_b32 on
; hawaii, v_perm_b32 with selector 0x7060302 on gfx942 and the -real-true16
; runs, and a v_mov_b16 into v0.l on the +real-true16 runs.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX7-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s0
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s0
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %op = sitofp <2 x i1> %num to <2 x bfloat>
  ret <2 x bfloat> %op
}
|
|
|
|
; amdgpu_ps variant of the <2 x i1> -> <2 x bfloat> signed conversion: the
; argument is inreg (read from s0 and s1 in the expected code) and the result
; is bitcast to <2 x i16> and sign-extended to <2 x i32> before being returned.
; Every expected sequence builds 0 / -1.0 per lane as an f32 in a VGPR,
; arithmetic-shifts each lane right by 16, and copies the two lanes into s0 and
; s1 with v_readfirstlane_b32.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX7-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_and_b32 s2, 1, s0
; GFX7-NEXT: s_bitcmp1_b32 s1, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[0:1]
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX7-NEXT: v_readfirstlane_b32 s0, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, 1, s0
; GFX9-NEXT: s_bitcmp1_b32 s1, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[0:1]
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX9-NEXT: v_add_u32_e32 v3, v3, v0
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, 1, s0
; GFX11-NEXT: s_bitcmp1_b32 s1, 0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_and_b32 s0, 1, s0
; GFX12-NEXT: s_bitcmp1_b32 s1, 0
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s0, 1
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s1, v0
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: ; return to shader part epilog
  %op = sitofp <2 x i1> %num to <2 x bfloat>
  %b16 = bitcast <2 x bfloat> %op to <2 x i16>
  %b32 = sext <2 x i16> %b16 to <2 x i32>
  ret <2 x i32> %b32
}
|
|
|
|
define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
|
|
; GFX7-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v3, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_alignbit_b32 v0, v2, v0, 16
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
|
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
|
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
|
|
; GFX9-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-TRUE16-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX12-TRUE16: ; %bb.0:
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
|
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
|
|
; GFX12-FAKE16: ; %bb.0:
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
|
|
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
%op = sitofp <3 x i1> %num to <3 x bfloat>
|
|
ret <3 x bfloat> %op
|
|
}
|
|
|
|
; Shader (amdgpu_ps) variant of the signed <3 x i1> -> <3 x bfloat> conversion
; test. The i1 inputs are marked inreg, and the bfloat result is bitcast to i16
; and sign-extended so the function returns <3 x i32>; the checks below show
; each lane being copied into s0-s2 with v_readfirstlane_b32 before the shader
; epilog.
; In every prefix a set i1 lane selects -1.0 (v_cndmask between 0 and -1.0).
; GFX7 checks then only shift each f32 right by 16. GFX9/GFX11/GFX12 checks
; carry an extra bfe / add 0x7fff / or 0x400000 / v_cmp_u / cndmask sequence
; per lane before the shift (presumably round-to-nearest-even with NaN
; quieting for the f32 -> bf16 narrowing -- TODO confirm against the lowering).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing by hand.
define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
|
|
; GFX7-LABEL: s_sitofp_v3i1_to_v3bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_and_b32 s4, 1, s0
|
|
; GFX7-NEXT: s_and_b32 s3, 1, s1
|
|
; GFX7-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s3, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[4:5]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s[0:1]
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_sitofp_v3i1_to_v3bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_and_b32 s4, 1, s0
|
|
; GFX9-NEXT: s_and_b32 s3, 1, s1
|
|
; GFX9-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s3, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[4:5]
|
|
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
|
|
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[2:3]
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s[0:1]
|
|
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v4, v4, v2
|
|
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v2
|
|
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX11-LABEL: s_sitofp_v3i1_to_v3bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_and_b32 s0, 1, s0
|
|
; GFX11-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2
|
|
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s0, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
|
|
; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
|
|
; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
|
|
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX11-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_sitofp_v3i1_to_v3bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_and_b32 s0, 1, s0
|
|
; GFX12-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX12-NEXT: s_bitcmp1_b32 s2, 0
|
|
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2
|
|
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s0, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
|
|
; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
|
; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v2
|
|
; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
; Signed conversion: a set i1 lane becomes -1.0 (matches the -1.0 selects in
; the checks above).
%op = sitofp <3 x i1> %num to <3 x bfloat>
|
|
; Reinterpret the bfloat bits as i16 lanes ...
%b16 = bitcast <3 x bfloat> %op to <3 x i16>
|
|
; ... and sign-extend them to the i32 lanes of the return type.
%b32 = sext <3 x i16> %b16 to <3 x i32>
|
|
ret <3 x i32> %b32
|
|
}
|
|
|
|
; Default-calling-convention test: signed conversion of <4 x i1> to
; <4 x bfloat>, returned directly as <4 x bfloat>.
; In every prefix a set i1 lane selects -1.0 (v_cndmask between 0 and -1.0).
; The four results end up packed in v0 and v1 before the return: through
; v_lshrrev/v_alignbit on GFX7, v_perm_b32 with selector 0x7060302 on GFX9 and
; the FAKE16 runs, and v_mov_b16 of the high halves on the TRUE16 runs.
; GFX9 and newer checks also contain a bfe / add3 0x7fff / or 0x400000 /
; v_cmp_u / cndmask sequence per lane (presumably round-to-nearest-even with
; NaN quieting for the f32 -> bf16 narrowing -- TODO confirm against the
; lowering); the GFX7 checks have no such sequence.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them instead of editing by hand.
define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
|
|
; GFX7-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
|
|
; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
|
; GFX7-NEXT: v_alignbit_b32 v1, v3, v2, 16
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
|
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
|
|
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
|
|
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
|
|
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
|
|
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s0
|
|
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: s_mov_b32 s0, 0x7060302
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
|
|
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
|
|
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-TRUE16-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
|
|
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
|
|
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
|
|
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
|
|
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
|
|
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-TRUE16-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX12-TRUE16: ; %bb.0:
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
|
|
; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
|
|
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:
|
|
; GFX12-FAKE16: ; %bb.0:
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
|
|
; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
|
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
|
|
; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
|
|
; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
|
|
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
|
; Signed conversion: a set i1 lane becomes -1.0 (matches the -1.0 selects in
; the checks above).
%op = sitofp <4 x i1> %num to <4 x bfloat>
|
|
ret <4 x bfloat> %op
|
|
}
|
|
|
|
define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
|
|
; GFX7-LABEL: s_sitofp_v4i1_to_v4bf16:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_and_b32 s6, 1, s0
|
|
; GFX7-NEXT: s_and_b32 s4, 1, s1
|
|
; GFX7-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX7-NEXT: s_bitcmp1_b32 s3, 0
|
|
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX7-NEXT: s_cmp_eq_u32 s6, 1
|
|
; GFX7-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[6:7]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[4:5]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s[0:1]
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 16, v3
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX7-NEXT: v_readfirstlane_b32 s3, v3
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_sitofp_v4i1_to_v4bf16:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_and_b32 s6, 1, s0
|
|
; GFX9-NEXT: s_and_b32 s4, 1, s1
|
|
; GFX9-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX9-NEXT: s_bitcmp1_b32 s3, 0
|
|
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
|
; GFX9-NEXT: s_cmp_eq_u32 s6, 1
|
|
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[6:7]
|
|
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v2, v2, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v0
|
|
; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[4:5]
|
|
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
|
|
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v1
|
|
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s[2:3]
|
|
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v4, v4, v2
|
|
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v2
|
|
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s[0:1]
|
|
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
|
|
; GFX9-NEXT: v_add_u32_e32 v5, v5, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v3
|
|
; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
|
|
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v3
|
|
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s3, v3
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX11-LABEL: s_sitofp_v4i1_to_v4bf16:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_and_b32 s0, 1, s0
|
|
; GFX11-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX11-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX11-NEXT: s_bitcmp1_b32 s3, 0
|
|
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3
|
|
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2
|
|
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX11-NEXT: s_cmp_eq_u32 s0, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1
|
|
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
|
|
; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
|
|
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
|
|
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
|
|
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s1, v2
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v3
|
|
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s2, v3
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
|
|
; GFX11-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_sitofp_v4i1_to_v4bf16:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_and_b32 s0, 1, s0
|
|
; GFX12-NEXT: s_and_b32 s1, 1, s1
|
|
; GFX12-NEXT: s_and_b32 s2, 1, s2
|
|
; GFX12-NEXT: s_bitcmp1_b32 s3, 0
|
|
; GFX12-NEXT: s_cselect_b32 s3, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s2, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3
|
|
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2
|
|
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s0, 1
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1
|
|
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
|
; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
|
|
; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
|
|
; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
|
|
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
|
|
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
|
|
; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
|
|
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
|
|
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s1, v2
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 16, v3
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s2, v3
|
|
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_readfirstlane_b32 s3, v1
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
%op = sitofp <4 x i1> %num to <4 x bfloat>
|
|
%b16 = bitcast <4 x bfloat> %op to <4 x i16>
|
|
%b32 = sext <4 x i16> %b16 to <4 x i32>
|
|
ret <4 x i32> %b32
|
|
}
|