[TargetLowering] Be more efficient in fp -> bf16 NaN conversions
We can avoid masking completely as it is OK (and probably preferable) to bring over some of the existing NaN payload.
This commit is contained in:
parent
d17eade22a
commit
be36812fb7
@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
|
||||
Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
|
||||
Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);
|
||||
|
||||
// Extract the sign bit and exponent.
|
||||
SDValue SignBitAndExponentField = DAG.getNode(
|
||||
ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
|
||||
// Set the quiet bit.
|
||||
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
|
||||
DAG.getConstant(0x400000, dl, I32));
|
||||
// Conversions should set NaN's quiet bit. This also prevents NaNs from
|
||||
// turning into infinities.
|
||||
SDValue NaN =
|
||||
DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32));
|
||||
|
||||
// Factor in the contribution of the low 16 bits.
|
||||
SDValue One = DAG.getConstant(1, dl, I32);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -790,8 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
@ -806,9 +805,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
|
||||
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
|
||||
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
|
||||
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
|
||||
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
|
||||
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
|
||||
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
|
@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
|
||||
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
|
||||
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
|
||||
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_and_b32 s0, s2, -4
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_and_b32 s0, s2, -4
|
||||
@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
|
||||
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
|
||||
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
|
||||
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
|
||||
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
|
||||
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_and_b32 s0, s2, -4
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_and_b32 s0, s2, -4
|
||||
@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
|
||||
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
|
@ -912,10 +912,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
|
||||
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
|
||||
@ -934,10 +933,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
|
||||
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
|
||||
@ -956,10 +954,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
|
||||
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
|
||||
@ -978,10 +975,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
|
||||
; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
|
||||
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
|
||||
|
@ -1413,9 +1413,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
|
||||
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
|
||||
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
|
||||
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
|
||||
; VI-NEXT: v_and_b32_e32 v5, v4, v2
|
||||
@ -1451,9 +1450,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
|
||||
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
|
||||
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
@ -1560,9 +1558,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
|
||||
; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
|
||||
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
|
||||
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
|
||||
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
|
||||
; VI-NEXT: v_and_b32_e32 v5, v3, v2
|
||||
@ -1597,9 +1594,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
|
||||
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
|
||||
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
|
@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7
|
||||
; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1
|
||||
; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
|
||||
; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
|
||||
; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7
|
||||
; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
|
||||
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7
|
||||
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1
|
||||
; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
|
||||
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1
|
||||
; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8
|
||||
; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8
|
||||
; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2
|
||||
; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
|
||||
; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
|
||||
; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
|
||||
; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
|
||||
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
|
||||
; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
|
||||
; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
|
||||
; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5
|
||||
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
|
||||
; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
|
||||
; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
|
||||
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
|
||||
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3
|
||||
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3
|
||||
; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2
|
||||
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2
|
||||
; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
|
||||
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4
|
||||
; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4
|
||||
; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2
|
||||
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
|
||||
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
|
||||
@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
|
||||
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0xff800000
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x2
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
|
||||
@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4
|
||||
; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
|
||||
; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
|
||||
; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
|
||||
; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
|
||||
; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
|
||||
; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1
|
||||
; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
|
||||
; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1
|
||||
; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
|
||||
; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
|
||||
; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
|
||||
; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
|
||||
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10
|
||||
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
|
||||
@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
||||
; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
|
||||
; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000
|
||||
; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
|
||||
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
|
||||
; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
|
||||
; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
|
||||
; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000
|
||||
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
|
||||
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
|
||||
; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
|
||||
@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
|
||||
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0xff800000
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_clause 0x2
|
||||
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
|
||||
@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
|
||||
; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9
|
||||
; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1
|
||||
; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
|
||||
@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4
|
||||
; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
|
||||
; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
|
||||
; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
|
||||
; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
|
||||
@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16
|
||||
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
|
||||
; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
|
||||
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10
|
||||
; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
|
||||
; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
|
||||
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000
|
||||
; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
|
||||
|
Loading…
x
Reference in New Issue
Block a user