[AMDGPU][True16][Codegen] remove another build_vector pattern from true16 (#149861)

Remove another build_vector pattern which takes a i16 but placed in a
VGPR_32 from true16 mode. This stop isel from generating illegal
"vgpr_32 = COPY vgpr_16".

ISel will use vgpr16 build vector pattern in true16 mode instead
This commit is contained in:
Brox Chen 2025-09-04 18:08:18 -04:00 committed by GitHub
parent 84b56202fb
commit dfdfc4e490
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
39 changed files with 14851 additions and 13195 deletions

View File

@ -2201,7 +2201,6 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@ -3694,13 +3693,24 @@ def : GCNPat <
>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in
let True16Predicate = p in {
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$b, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
>;
}
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@ -3726,18 +3736,6 @@ def : GCNPat <
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
>;
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
(Ty !if(!eq(Ty, i16),
(Ty (trunc (srl VGPR_32:$b, (i32 16)))),
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
>;
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
let True16Predicate = NotHasTrue16BitInsts in {

File diff suppressed because it is too large Load Diff

View File

@ -2668,85 +2668,92 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v8, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h
; GFX11-TRUE16-NEXT: .LBB22_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4i32:
@ -7118,85 +7125,92 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v8, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB46_2: ; %end
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h
; GFX11-TRUE16-NEXT: .LBB46_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4f32:
@ -11216,85 +11230,92 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v8, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB66_2: ; %end
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h
; GFX11-TRUE16-NEXT: .LBB66_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2i64:
@ -14900,85 +14921,92 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v13, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v8, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB82_2: ; %end
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h
; GFX11-TRUE16-NEXT: .LBB82_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2f64:
@ -21093,24 +21121,30 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@ -21126,55 +21160,53 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v1, v7 :: v_dual_and_b32 v7, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v11, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v7 :: v_dual_add_f32 v7, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v12 :: v_dual_and_b32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v13, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v7, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v9, v4
; GFX11-TRUE16-NEXT: .LBB102_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
; GFX11-TRUE16-NEXT: .LBB102_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8f16:
@ -23752,140 +23784,139 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[0:1]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v11.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v13, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v9, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_and_b32 v5, 0xffff0000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v8, v10 :: v_dual_and_b32 v1, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v15, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v14, v10, 0x7fff
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v9, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v1, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v13, v7
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11
; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v9, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v0, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v11, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v3, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v13, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v3, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v11, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v6.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v17
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v12.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v17
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v21
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v22.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8:

File diff suppressed because it is too large Load Diff

View File

@ -1303,13 +1303,18 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@ -1322,15 +1327,16 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB14_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32:
@ -3543,13 +3549,18 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@ -3562,15 +3573,16 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: .LBB34_2: ; %end
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB34_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32:
@ -7051,13 +7063,18 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@ -7070,15 +7087,16 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: .LBB62_2: ; %end
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB62_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16:
@ -8488,13 +8506,18 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@ -8507,15 +8530,16 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: .LBB72_2: ; %end
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32:
@ -9062,15 +9086,14 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h

View File

@ -138,46 +138,51 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v3, v7 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0x7fc0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: .LBB0_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16:

File diff suppressed because it is too large Load Diff

View File

@ -2153,56 +2153,56 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
; GFX11-TRUE16-NEXT: .LBB22_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64:
@ -5288,56 +5288,56 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB46_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
; GFX11-TRUE16-NEXT: .LBB46_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64:
@ -8135,56 +8135,56 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB66_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
; GFX11-TRUE16-NEXT: .LBB66_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32:
@ -10655,56 +10655,56 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB82_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
; GFX11-TRUE16-NEXT: .LBB82_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32:
@ -14628,55 +14628,58 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: .LBB102_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
; GFX11-TRUE16-NEXT: .LBB102_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16:
@ -16248,78 +16251,78 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v9.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[0:1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v0 :: v_dual_add_f32 v0, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v2 :: v_dual_add_f32 v2, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v3, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9]
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v12, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v1, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[8:9]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8:

View File

@ -2075,66 +2075,75 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB10_2: ; %end
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v0, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v9, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
; GFX11-TRUE16-NEXT: .LBB10_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3i32:
@ -5213,66 +5222,75 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v0, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v9, v11, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
; GFX11-TRUE16-NEXT: .LBB26_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3f32:
@ -7734,111 +7752,109 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v13, v1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[0:1]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2
; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v13.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, 0x7fc07fc0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v11, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v11, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v4, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v0, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v13, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v7, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v12, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v12, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v0, v8 :: v_dual_add_f32 v0, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v3, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7fc07fc0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v5, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[16:17]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16
; GFX11-TRUE16-NEXT: .LBB38_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v14.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8:
@ -11413,71 +11429,74 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v3, v1, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v8, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v9, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v12 :: v_dual_add_f32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB48_2: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
; GFX11-TRUE16-NEXT: .LBB48_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6f16:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -201,11 +201,6 @@ define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v6, v2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: bitcast_i160_to_v10i16:

View File

@ -9127,11 +9127,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -9557,13 +9555,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -10034,6 +10030,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2
@ -10057,9 +10055,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
@ -10090,6 +10086,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -10734,11 +10731,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -11164,13 +11159,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -11597,11 +11590,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -12027,13 +12018,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -12453,13 +12442,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)

View File

@ -7426,12 +7426,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -7581,11 +7579,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -7955,13 +7951,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -8101,13 +8096,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -8495,9 +8488,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
@ -8728,6 +8720,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
@ -8751,9 +8745,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
@ -8784,6 +8776,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]

View File

@ -7426,12 +7426,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -7581,11 +7579,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -7955,13 +7951,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -8101,13 +8096,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -8495,9 +8488,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
@ -8728,6 +8720,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
@ -8751,9 +8745,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
@ -8784,6 +8776,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]

View File

@ -476,25 +476,14 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) {
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4f16:
; GFX11-TRUE16-SDAG: ; %bb.0:
; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART
; GFX11-TRUE16-SDAG-NEXT: ; use v[0:1]
; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND
; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-GISEL-LABEL: undef_lo2_v4f16:
; GFX11-TRUE16-GISEL: ; %bb.0:
; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART
; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1]
; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND
; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: undef_lo2_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v[0:1]
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
%undef.lo = shufflevector <2 x half> %arg0, <2 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo);
ret void

View File

@ -0,0 +1,147 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16
; Make sure no "vgpr32 = copy vgpr16" is generated
define amdgpu_kernel void @build_vector_bfi (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %out) {
; GFX11-REAL16-LABEL: name: build_vector_bfi
; GFX11-REAL16: bb.0.entry:
; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GFX11-REAL16-NEXT: {{ $}}
; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4)
; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GFX11-REAL16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:vgpr_16 = COPY [[GLOBAL_LOAD_DWORD_SADDR]].lo16
; GFX11-REAL16-NEXT: [[COPY9:%[0-9]+]]:vgpr_16 = COPY [[GLOBAL_LOAD_DWORD_SADDR1]].hi16
; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[COPY9]], %subreg.lo16, killed [[V_MOV_B16_t16_e64_]], %subreg.hi16
; GFX11-REAL16-NEXT: [[COPY10:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE3]].lo16
; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[COPY8]], %subreg.lo16, killed [[COPY10]], %subreg.hi16
; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.3, addrspace 1)
; GFX11-REAL16-NEXT: S_ENDPGM 0
;
; GFX11-FAKE16-LABEL: name: build_vector_bfi
; GFX11-FAKE16: bb.0.entry:
; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GFX11-FAKE16-NEXT: {{ $}}
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4)
; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
; GFX11-FAKE16-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORD_SADDR]]
; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX11-FAKE16-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_2]], killed [[COPY8]], killed [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_BFI_B32_e64_]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s32) into %ir.3, addrspace 1)
; GFX11-FAKE16-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid
%in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid
%load.i32.a = load i32, ptr addrspace(1) %in.gep1
%load.i32.b = load i32, ptr addrspace(1) %in.gep2
%load.i16.a = trunc i32 %load.i32.a to i16
%load.i32.b.1 = lshr i32 %load.i32.b, 16
%load.i16.b = trunc i32 %load.i32.b.1 to i16
%ins.0 = insertelement <2 x i16> poison, i16 %load.i16.a, i32 0
%ins.1 = insertelement <2 x i16> %ins.0, i16 %load.i16.b, i32 1
store <2 x i16> %ins.1, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @build_vector_and (ptr addrspace(1) %a, ptr addrspace(1) %out) {
; GFX11-REAL16-LABEL: name: build_vector_and
; GFX11-REAL16: bb.0.entry:
; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GFX11-REAL16-NEXT: {{ $}}
; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1)
; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 1, 0, implicit $exec
; GFX11-REAL16-NEXT: [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, killed [[V_MOV_B16_t16_e64_]], 0, 0, implicit $exec
; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_ADD_NC_U16_t16_e64_]], %subreg.lo16, killed [[V_MOV_B16_t16_e64_1]], %subreg.hi16
; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
; GFX11-REAL16-NEXT: S_ENDPGM 0
;
; GFX11-FAKE16-LABEL: name: build_vector_and
; GFX11-FAKE16: bb.0.entry:
; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GFX11-FAKE16-NEXT: {{ $}}
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4)
; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1)
; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX11-FAKE16-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, killed [[GLOBAL_LOAD_USHORT_SADDR]], 0, killed [[S_MOV_B32_2]], 0, 0, implicit $exec
; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_1]], killed [[V_ADD_NC_U16_fake16_e64_]], implicit $exec
; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_AND_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
; GFX11-FAKE16-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep1 = getelementptr i16, ptr addrspace(1) %a, i32 %tid
%load.i16.a = load i16, ptr addrspace(1) %in.gep1
%add = add i16 %load.i16.a, 1
%ins.0 = insertelement <2 x i16> poison, i16 %add, i32 0
%ins.1 = insertelement <2 x i16> %ins.0, i16 0, i32 1
store <2 x i16> %ins.1, ptr addrspace(1) %out
ret void
}

View File

@ -834,8 +834,9 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p
; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
@ -967,8 +968,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@ -1038,11 +1040,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:

View File

@ -3116,23 +3116,21 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test3:
@ -3181,23 +3179,21 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test3:
@ -3325,23 +3321,21 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test4:
@ -3390,23 +3384,21 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test4:

View File

@ -462,11 +462,19 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: divergent_vec_i16_LH:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: divergent_vec_i16_LH:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: divergent_vec_i16_LH:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%shift = lshr i32 %b, 16
%tr = trunc i32 %shift to i16
%tmp = insertelement <2 x i16> poison, i16 %a, i32 0

View File

@ -630,29 +630,28 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_mul_f32 v2, v1, v2 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_mul_f32 v2, v1, v2 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@ -813,31 +812,30 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_and_b32 s2, s4, 0xffff0000
; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, s2, v1
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
; GFX11-TRUE16-NEXT: s_and_b32 s2, s4, 0xffff0000
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, s2, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;

View File

@ -4045,10 +4045,8 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
@ -4274,10 +4272,8 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3]
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v4
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16:
@ -4439,10 +4435,8 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
@ -5842,29 +5836,26 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %ma
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11TRUE16-NEXT: v_add3_u32 v7, v8, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v2, v4
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16:
@ -6124,65 +6115,65 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m
; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v16, v[4:5]
; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v14, v[0:1]
; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v15, v[2:3]
; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v15, v[4:5]
; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v16, v[2:3]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v15
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v16
; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v15
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v16
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s4, |v[4:5]|, |v[12:13]|
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s2, |v[0:1]|, |v[8:9]|
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s3, |v[2:3]|, |v[10:11]|
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s3, |v[0:1]|, |v[8:9]|
; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[8:9]
; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s1, v[4:5], v[12:13]
; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s0, v[2:3], v[10:11]
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s4
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v8, -1, 1, s2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s0, v[4:5], v[10:11]
; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s1, v[2:3], v[12:13]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v8, -1, 1, s3
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s3, |v[4:5]|, |v[10:11]|
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, v14, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v9, -1, 1, s3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s3, |v[2:3]|, |v[12:13]|
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, v15, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s3
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, v16, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v14
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, v14, v8
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, v15, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v19
; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s3
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_and_b32 v17, 1, v15
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v17
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s0, s2
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v15 :: v_dual_and_b32 v18, 1, v16
; GFX11TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
; GFX11TRUE16-NEXT: v_add3_u32 v8, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v18
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s1, s4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v9, v12, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX11TRUE16-NEXT: v_add3_u32 v8, v11, v8, 0x7fff
; GFX11TRUE16-NEXT: s_or_b32 s0, s0, s2
; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0
; GFX11TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v10
; GFX11TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_add3_u32 v9, v12, v9, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3]
; GFX11TRUE16-NEXT: v_add3_u32 v8, v11, v10, 0x7fff
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v14, vcc_lo
; GFX11TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_add3_u32 v10, v14, v10, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v15, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3]
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v2, v6
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16:
@ -6387,29 +6378,26 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11TRUE16-NEXT: v_add3_u32 v7, v8, v4, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
@ -7083,15 +7071,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16(<4 x float> %ma
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v4
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v3, v5
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16:
@ -7485,18 +7470,14 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v21, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[6:7], v[6:7]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3]
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v12, v20, vcc_lo
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v2, v8
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16:
@ -7767,15 +7748,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v3
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v4, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:

View File

@ -18660,7 +18660,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18683,10 +18682,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18698,7 +18696,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB68_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -18986,7 +18983,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19009,10 +19005,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19024,7 +19019,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB69_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19321,7 +19315,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19343,10 +19336,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19358,7 +19350,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -19658,7 +19649,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19676,13 +19666,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19695,7 +19684,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB71_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -19974,7 +19962,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19992,13 +19979,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -20011,7 +19997,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -20303,7 +20288,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -20321,13 +20305,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -20340,7 +20323,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB73_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -20640,7 +20622,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -20663,10 +20644,9 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -20678,7 +20658,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -20972,7 +20951,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -20990,13 +20968,12 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -21009,7 +20986,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB75_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -21297,7 +21273,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21320,10 +21295,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -21335,7 +21309,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -21623,7 +21596,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21641,13 +21613,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -21660,7 +21631,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB77_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -21939,7 +21909,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21962,10 +21931,9 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -21977,7 +21945,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -22265,7 +22232,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -22283,13 +22249,12 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -22302,7 +22267,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -16192,11 +16192,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16314,7 +16313,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16337,10 +16335,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16352,7 +16349,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16643,11 +16639,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16765,7 +16760,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16788,10 +16782,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16803,7 +16796,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17097,11 +17089,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17229,7 +17220,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17251,10 +17241,9 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17266,7 +17255,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17564,14 +17552,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17687,7 +17673,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17705,13 +17690,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17724,7 +17708,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18001,14 +17984,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -18124,7 +18105,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18142,13 +18122,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18161,7 +18140,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18445,14 +18423,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -18577,7 +18553,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18595,13 +18570,12 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18614,7 +18588,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18916,11 +18889,10 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19040,7 +19012,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19063,10 +19034,9 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19078,7 +19048,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19369,14 +19338,12 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19494,7 +19461,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19512,13 +19478,12 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19531,7 +19496,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -16192,11 +16192,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16314,7 +16313,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16337,10 +16335,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16352,7 +16349,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16643,11 +16639,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16765,7 +16760,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16788,10 +16782,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16803,7 +16796,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17097,11 +17089,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17229,7 +17220,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17251,10 +17241,9 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17266,7 +17255,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17564,14 +17552,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17687,7 +17673,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17705,13 +17690,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17724,7 +17708,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18001,14 +17984,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -18124,7 +18105,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18142,13 +18122,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18161,7 +18140,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18445,14 +18423,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -18577,7 +18553,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18595,13 +18570,12 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18614,7 +18588,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18916,11 +18889,10 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19040,7 +19012,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19063,10 +19034,9 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19078,7 +19048,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19369,14 +19338,12 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19494,7 +19461,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19512,13 +19478,12 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -19531,7 +19496,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -15611,11 +15611,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -15733,7 +15732,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -15756,10 +15754,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -15771,7 +15768,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16062,11 +16058,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16184,7 +16179,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16207,10 +16201,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16222,7 +16215,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16516,11 +16508,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -16648,7 +16639,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16670,10 +16660,9 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16685,7 +16674,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -16983,14 +16971,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17106,7 +17092,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17124,13 +17109,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17143,7 +17127,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17420,14 +17403,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17543,7 +17524,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17561,13 +17541,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17580,7 +17559,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17864,14 +17842,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@ -17996,7 +17972,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18014,13 +17989,12 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18033,7 +18007,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18335,11 +18308,10 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -18459,7 +18431,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18482,10 +18453,9 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18497,7 +18467,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -18788,14 +18757,12 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -18913,7 +18880,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18931,13 +18897,12 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -18950,7 +18915,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -20405,7 +20405,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -20428,10 +20427,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -20443,7 +20441,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -20784,7 +20781,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -20807,10 +20803,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -20822,7 +20817,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -21165,7 +21159,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21188,10 +21181,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -21203,7 +21195,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB80_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -21550,7 +21541,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21568,13 +21558,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -21587,7 +21576,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB81_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -21917,7 +21905,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -21935,13 +21922,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -21954,7 +21940,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -22287,7 +22272,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -22305,13 +22289,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -22324,7 +22307,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB83_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -22666,7 +22648,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -22689,10 +22670,9 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -22704,7 +22684,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB84_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -23050,7 +23029,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -23068,13 +23046,12 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -23087,7 +23064,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB85_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -23422,7 +23398,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -23445,10 +23420,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -23460,7 +23434,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -23801,7 +23774,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -23819,13 +23791,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -23838,7 +23809,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB87_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -24168,7 +24138,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -24191,10 +24160,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -24206,7 +24174,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -24547,7 +24514,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -24565,13 +24531,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -24584,7 +24549,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB89_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -24914,7 +24878,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -24937,10 +24900,9 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -24952,7 +24914,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB90_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -25293,7 +25254,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -25311,13 +25271,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -25330,7 +25289,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB91_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -15989,11 +15989,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16111,7 +16110,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16134,10 +16132,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -16149,7 +16146,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16493,11 +16489,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16615,7 +16610,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16638,10 +16632,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -16653,7 +16646,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16999,11 +16991,10 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17121,7 +17112,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17144,10 +17134,9 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17159,7 +17148,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17504,14 +17492,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17627,7 +17613,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17645,13 +17630,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17664,7 +17648,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17992,14 +17975,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18115,7 +18096,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18133,13 +18113,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18152,7 +18131,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18483,14 +18461,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18606,7 +18582,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18624,13 +18599,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18643,7 +18617,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18987,11 +18960,10 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19111,7 +19083,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19134,10 +19105,9 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19149,7 +19119,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19492,14 +19461,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19617,7 +19584,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19635,13 +19601,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19654,7 +19619,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -15989,11 +15989,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16111,7 +16110,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16134,10 +16132,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -16149,7 +16146,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16493,11 +16489,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16615,7 +16610,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16638,10 +16632,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -16653,7 +16646,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16999,11 +16991,10 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17121,7 +17112,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17144,10 +17134,9 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17159,7 +17148,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17504,14 +17492,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17627,7 +17613,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17645,13 +17630,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17664,7 +17648,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -17992,14 +17975,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18115,7 +18096,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18133,13 +18113,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18152,7 +18131,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18483,14 +18461,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18606,7 +18582,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18624,13 +18599,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18643,7 +18617,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18987,11 +18960,10 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19111,7 +19083,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19134,10 +19105,9 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19149,7 +19119,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19492,14 +19461,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19617,7 +19584,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19635,13 +19601,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19654,7 +19619,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -16350,11 +16350,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16472,7 +16471,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16495,10 +16493,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -16510,7 +16507,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -16854,11 +16850,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -16976,7 +16971,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -16999,10 +16993,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17014,7 +17007,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17360,11 +17352,10 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17482,7 +17473,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -17505,10 +17495,9 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -17520,7 +17509,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -17865,14 +17853,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -17988,7 +17974,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18006,13 +17991,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18025,7 +18009,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18353,14 +18336,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18476,7 +18457,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18494,13 +18474,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -18513,7 +18492,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -18844,14 +18822,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@ -18967,7 +18943,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -18985,13 +18960,12 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19004,7 +18978,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -19348,11 +19321,10 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19472,7 +19444,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19495,10 +19466,9 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -19510,7 +19480,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -19853,14 +19822,12 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@ -19978,7 +19945,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -19996,13 +19962,12 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@ -20015,7 +19980,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -246,14 +246,12 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
@ -307,15 +305,13 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
@ -533,38 +529,36 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
@ -612,43 +606,41 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
@ -933,49 +925,44 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
@ -1033,57 +1020,51 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
@ -1588,14 +1569,12 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
@ -1649,15 +1628,13 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
@ -1877,38 +1854,36 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
@ -1956,43 +1931,41 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v6, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
@ -2280,49 +2253,44 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, -1.0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:
@ -2380,57 +2348,51 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX12-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, -1.0, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:

View File

@ -751,19 +751,32 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_movk_i32 s2, 0x3e7
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v2i16_0:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0x3e7
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v2i16_0:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_movk_i32 s2, 0x3e7
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@ -929,18 +942,31 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 53
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@ -1190,19 +1216,32 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_movk_i32 s2, 0x4500
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v2f16_0:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0x4500
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v2f16_0:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_movk_i32 s2, 0x4500
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
@ -1268,18 +1307,31 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v2f16_0_inlineimm:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 53
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v2f16_0_inlineimm:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
@ -1821,19 +1873,33 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v4f16_0:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v4f16_0:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@ -2004,19 +2070,33 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v4f16_2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v4f16_2:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@ -2187,19 +2267,33 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4i16_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v4i16_2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v4i16_2:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@ -2592,19 +2686,33 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v8i16_6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v8i16_6:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v8i16_6:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@ -3087,24 +3195,42 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v16i16_6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: v_insertelement_v16i16_6:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s4
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v16i16_6:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext

View File

@ -436,9 +436,7 @@ define <2 x bfloat> @v_exp2_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16:
@ -552,9 +550,8 @@ define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16:
@ -659,29 +656,28 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16:
@ -774,43 +770,43 @@ define <2 x bfloat> @v_exp2_fneg_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l
; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16:
@ -929,9 +925,7 @@ define <2 x bfloat> @v_exp2_v2bf16_fast(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16_fast:

View File

@ -6834,7 +6834,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -6857,12 +6856,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -6871,7 +6869,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7191,7 +7188,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7214,12 +7210,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7228,7 +7223,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7549,7 +7543,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7570,11 +7563,10 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7584,7 +7576,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -7893,7 +7884,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7914,11 +7904,10 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7928,7 +7917,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -6651,13 +6651,12 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -6771,7 +6770,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -6794,12 +6792,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -6808,7 +6805,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7133,13 +7129,12 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7253,7 +7248,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7276,12 +7270,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7290,7 +7283,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7615,11 +7607,9 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7730,7 +7720,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7751,11 +7740,10 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7765,7 +7753,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -8078,11 +8065,9 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8193,7 +8178,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -8214,11 +8198,10 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8228,7 +8211,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -6651,13 +6651,12 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -6771,7 +6770,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -6794,12 +6792,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -6808,7 +6805,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7133,13 +7129,12 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7253,7 +7248,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7276,12 +7270,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7290,7 +7283,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7615,11 +7607,9 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7730,7 +7720,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7751,11 +7740,10 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -7765,7 +7753,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -8078,11 +8065,9 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8193,7 +8178,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -8214,11 +8198,10 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8228,7 +8211,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -7419,13 +7419,12 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7539,7 +7538,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -7562,12 +7560,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -7576,7 +7573,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -7901,13 +7897,12 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -8021,7 +8016,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -8044,12 +8038,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v5, v4 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
@ -8058,7 +8051,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -8383,11 +8375,9 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8498,7 +8488,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -8519,11 +8508,10 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8533,7 +8521,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -8846,11 +8833,9 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8961,7 +8946,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
@ -8982,11 +8966,10 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
@ -8996,7 +8979,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -1079,9 +1079,7 @@ define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v2bf16_rebroadcast:
@ -1120,12 +1118,10 @@ define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-LABEL: shuffle_v3bf16_rebroadcast:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v3bf16_rebroadcast:
@ -1167,9 +1163,8 @@ define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -1215,9 +1210,8 @@ define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -1267,9 +1261,8 @@ define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@ -1329,9 +1322,8 @@ define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@ -1415,9 +1407,8 @@ define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@ -1487,9 +1478,7 @@ define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v2f16_rebroadcast:
@ -1528,12 +1517,10 @@ define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-LABEL: shuffle_v3f16_rebroadcast:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v3f16_rebroadcast:
@ -1575,9 +1562,8 @@ define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -1623,9 +1609,8 @@ define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@ -1675,9 +1660,8 @@ define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@ -1737,9 +1721,8 @@ define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@ -1823,9 +1806,8 @@ define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0

View File

@ -331,8 +331,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5:
@ -390,12 +389,9 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_357u:
@ -1225,13 +1221,15 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_5734:
@ -1482,10 +1480,11 @@ define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_1100:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_1100:
@ -1538,13 +1537,11 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -1597,8 +1594,7 @@ define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_2333:
@ -1647,8 +1643,7 @@ define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_6667:
@ -2318,13 +2313,10 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: hi16bits_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: hi16bits_v2f16:
@ -2373,14 +2365,23 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: low16hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: low16hi16bits_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: low16hi16bits_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = load <2 x half>, ptr addrspace(1) %x0, align 4
%1 = load <2 x half>, ptr addrspace(1) %x1, align 4
@ -2520,14 +2521,23 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i16_low16hi16bits:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: i16_low16hi16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: i16_low16hi16bits:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
%1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
@ -3617,8 +3627,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5:
@ -3676,12 +3685,9 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u:
@ -4511,13 +4517,15 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734:
@ -4671,10 +4679,11 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_1100:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_1100:
@ -4727,13 +4736,11 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@ -4786,8 +4793,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_2333:
@ -4836,8 +4842,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_6667:
@ -5533,13 +5538,9 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
; GFX11-TRUE16-NEXT: global_store_b64 v6, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@ -5817,13 +5818,10 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-LABEL: hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: hi16bits_v2bf16:
@ -5875,10 +5873,10 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16: