AMDGPU: Sign extend immediates for 32-bit subregister extracts (#154870)

extractSubregFromImm previously would sign extend the 16-bit subregister
extracts, but not the 32-bit. We try to consistently store immediates
as sign extended, since not doing it can result in misreported
isInlineImmediate checks.
This commit is contained in:
Matt Arsenault 2025-08-22 16:50:36 +09:00 committed by GitHub
parent e0945dfa30
commit 2b46f31ee3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 264 additions and 276 deletions

View File

@ -3475,9 +3475,9 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
case AMDGPU::NoSubRegister:
return Imm;
case AMDGPU::sub0:
return Lo_32(Imm);
return SignExtend64<32>(Imm);
case AMDGPU::sub1:
return Hi_32(Imm);
return SignExtend64<32>(Imm >> 32);
case AMDGPU::lo16:
return SignExtend64<16>(Imm);
case AMDGPU::hi16:

View File

@ -503,9 +503,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
@ -2827,9 +2828,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
@ -16759,14 +16761,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -16816,9 +16815,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
@ -16841,9 +16841,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
@ -17331,13 +17332,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -17386,9 +17385,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
@ -17411,9 +17411,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
@ -19313,16 +19314,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start
@ -19367,16 +19365,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start
@ -19463,9 +19458,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -19506,9 +19502,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -20299,15 +20296,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start
@ -20352,15 +20347,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start
@ -20446,9 +20439,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@ -20489,9 +20483,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2

View File

@ -382,9 +382,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@ -409,9 +410,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@ -836,9 +838,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@ -863,9 +866,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@ -1936,9 +1940,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@ -1963,9 +1968,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@ -2390,9 +2396,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@ -2417,9 +2424,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@ -14767,15 +14775,12 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -14828,9 +14833,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
@ -14855,9 +14861,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
@ -15482,14 +15489,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -15542,9 +15547,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
@ -15569,9 +15575,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
@ -17215,16 +17222,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
@ -17269,16 +17273,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
@ -17365,9 +17366,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -17408,9 +17410,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -18567,15 +18570,13 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
@ -18620,15 +18621,13 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
@ -18714,9 +18713,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@ -18757,9 +18757,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2

View File

@ -382,9 +382,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@ -409,9 +410,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@ -836,9 +838,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@ -863,9 +866,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@ -1936,9 +1940,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@ -1963,9 +1968,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@ -2390,9 +2396,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@ -2417,9 +2424,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@ -14767,15 +14775,12 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -14828,9 +14833,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
@ -14855,9 +14861,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
@ -15482,14 +15489,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -15542,9 +15547,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
@ -15569,9 +15575,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
@ -17215,16 +17222,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
@ -17269,16 +17273,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
@ -17365,9 +17366,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -17408,9 +17410,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -18567,15 +18570,13 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
@ -18620,15 +18621,13 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
@ -18714,9 +18713,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@ -18757,9 +18757,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2

View File

@ -475,14 +475,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -532,9 +529,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
@ -557,9 +555,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
@ -1068,13 +1067,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -1123,9 +1120,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
@ -1148,9 +1146,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
@ -2083,14 +2082,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -2140,9 +2136,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
@ -2165,9 +2162,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
@ -2676,13 +2674,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -2731,9 +2727,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
@ -2756,9 +2753,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
@ -14301,14 +14299,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -14358,9 +14353,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
@ -14383,9 +14379,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start
@ -14960,13 +14957,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -15015,9 +15010,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
@ -15040,9 +15036,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
@ -16644,16 +16641,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
@ -16698,16 +16692,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
@ -16794,9 +16785,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -16837,9 +16829,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@ -17996,15 +17989,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
@ -18049,15 +18040,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
@ -18143,9 +18132,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@ -18186,9 +18176,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: s_mov_b64 s[4:5], vcc
; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2

View File

@ -63,8 +63,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: clear_subreg_imm_fold
; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967288
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295
; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 -8
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 -1
; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
%0:sreg_64 = S_MOV_B64 -8
%1:sgpr_32 = COPY %0.sub0

View File

@ -265,7 +265,7 @@ body: |
; GCN-LABEL: name: fmac_sreg_64_sub0_src0_to_fmamk
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 2882399984, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], -1412567312, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
@ -319,7 +319,7 @@ body: |
; GCN-LABEL: name: fma_sreg_64_sub0_to_fmaak
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 2882399984, implicit $mode, implicit $exec
; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], -1412567312, implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF