AMDGPU: Start to use AV classes for unknown vector class (#166482)
AMDGPU: Start to use AV classes for unknown vector class. Use AGPR+VGPR superclasses for gfx90a+. The type used for the class should be the broadest possible class, to be contextually restricted later. InstrEmitter clamps these to the common subclass of the context use instructions, so we're best off using the broadest possible class for all types. Note this does very little because we only use VGPR classes for FP types (though this doesn't particularly make any sense), and we legalize normal loads and stores to integer.
This commit is contained in:
parent
d1cc1376a0
commit
7ff4cd4da8
@ -91,64 +91,73 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
|
||||
|
||||
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
|
||||
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
|
||||
|
||||
const SIRegisterInfo *TRI = STI.getRegisterInfo();
|
||||
const TargetRegisterClass *V32RegClass =
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(32);
|
||||
addRegisterClass(MVT::f32, V32RegClass);
|
||||
|
||||
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
|
||||
|
||||
const SIRegisterInfo *TRI = STI.getRegisterInfo();
|
||||
const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
|
||||
const TargetRegisterClass *V64RegClass =
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(64);
|
||||
|
||||
addRegisterClass(MVT::f64, V64RegClass);
|
||||
addRegisterClass(MVT::v2f32, V64RegClass);
|
||||
addRegisterClass(MVT::Untyped, V64RegClass);
|
||||
|
||||
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
|
||||
addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
|
||||
addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
|
||||
|
||||
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
|
||||
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
|
||||
|
||||
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
|
||||
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
|
||||
addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
|
||||
|
||||
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
|
||||
addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
|
||||
addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
|
||||
|
||||
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
|
||||
addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
|
||||
addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
|
||||
|
||||
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
|
||||
addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
|
||||
addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
|
||||
|
||||
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
|
||||
addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
|
||||
addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
|
||||
|
||||
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
|
||||
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
|
||||
addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
|
||||
|
||||
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
|
||||
addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
|
||||
addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
|
||||
|
||||
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
|
||||
addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
|
||||
addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
|
||||
|
||||
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
|
||||
addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
|
||||
addRegisterClass(MVT::v10f32,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(320));
|
||||
|
||||
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
|
||||
addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
|
||||
addRegisterClass(MVT::v11f32,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(352));
|
||||
|
||||
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
|
||||
addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
|
||||
addRegisterClass(MVT::v12f32,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(384));
|
||||
|
||||
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
|
||||
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
|
||||
addRegisterClass(MVT::v16f32,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(512));
|
||||
|
||||
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
|
||||
addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
|
||||
addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
|
||||
|
||||
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
|
||||
addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
|
||||
addRegisterClass(MVT::v16f64,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(1024));
|
||||
|
||||
if (Subtarget->has16BitInsts()) {
|
||||
if (Subtarget->useRealTrue16Insts()) {
|
||||
@ -180,7 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
}
|
||||
|
||||
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
|
||||
addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
|
||||
addRegisterClass(MVT::v32f32,
|
||||
TRI->getDefaultVectorSuperClassForBitWidth(1024));
|
||||
|
||||
computeRegisterProperties(Subtarget->getRegisterInfo());
|
||||
|
||||
|
||||
@ -3557,6 +3557,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
|
||||
: getAnyVectorSuperClassForBitWidth(BitWidth);
|
||||
}
|
||||
|
||||
// Select the default vector register class for a value of \p BitWidth bits.
// When the subtarget reports hasGFX90AInsts(), this defers to
// getVectorSuperClassForBitWidth(); otherwise it falls back to
// getVGPRClassForBitWidth().
// NOTE(review): presumably the super class is the combined AGPR+VGPR (AV)
// class for that width -- confirm against getVectorSuperClassForBitWidth.
const TargetRegisterClass *
|
||||
SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
|
||||
// TODO: In principle this should use AV classes for gfx908 too. This is
|
||||
// limited to 90a+ to avoid regressing special case copy optimizations which
|
||||
// need new handling. The core issue is that it's not possible to directly
|
||||
// copy between AGPRs on gfx908, and the current optimizations around that
|
||||
// expect to see copies to VGPR.
|
||||
return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
|
||||
: getVGPRClassForBitWidth(BitWidth);
|
||||
}
|
||||
|
||||
const TargetRegisterClass *
|
||||
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
|
||||
if (BitWidth == 16 || BitWidth == 32)
|
||||
|
||||
@ -215,6 +215,10 @@ public:
|
||||
const TargetRegisterClass *
|
||||
getVectorSuperClassForBitWidth(unsigned BitWidth) const;
|
||||
|
||||
// Default vector register class to use for a value of \p BitWidth bits when
// no more specific class is known; the choice depends on the subtarget.
LLVM_READONLY
|
||||
const TargetRegisterClass *
|
||||
getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const;
|
||||
|
||||
LLVM_READONLY
|
||||
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -381,17 +381,17 @@ define float @no_unsafe(ptr %addr, float %val) {
|
||||
; GFX90A-LABEL: no_unsafe:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -836,7 +836,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX90A-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4
|
||||
; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5]
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v2, s[4:5]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: s_branch .LBB5_7
|
||||
; GFX90A-NEXT: .LBB5_6: ; %Flow
|
||||
@ -846,7 +846,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
|
||||
; GFX90A-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
@ -856,6 +855,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB5_7
|
||||
; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1
|
||||
@ -926,7 +926,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX942-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12
|
||||
; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1]
|
||||
; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: s_branch .LBB5_7
|
||||
; GFX942-NEXT: .LBB5_6: ; %Flow
|
||||
@ -936,7 +936,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
||||
; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
|
||||
; GFX942-NEXT: s_getpc_b64 s[2:3]
|
||||
; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
@ -946,6 +945,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB5_7
|
||||
; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1
|
||||
|
||||
@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
|
||||
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0
|
||||
; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v20, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
@ -16,12 +16,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: s_bitcmp1_b32 s0, 8
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
|
||||
; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
|
||||
; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90
|
||||
@ -37,8 +35,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883
|
||||
; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4
|
||||
; CHECK-NEXT: s_mov_b64 s[22:23], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136
|
||||
; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036
|
||||
; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136
|
||||
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523
|
||||
; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555
|
||||
@ -64,10 +62,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
|
||||
; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25]
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v27, a3
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v26, a2
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1]
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a0, 0
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a1, 0
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
@ -85,9 +81,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15]
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19]
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[16:17]
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19]
|
||||
; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
|
||||
; CHECK-NEXT: s_branch .LBB0_6
|
||||
; CHECK-NEXT: .LBB0_5: ; %Flow
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
|
||||
@ -96,8 +93,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329
|
||||
; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v29, a1
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v28, a0
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v27, a1
|
||||
; CHECK-NEXT: v_accvgpr_read_b32 v26, a0
|
||||
; CHECK-NEXT: s_mov_b64 s[24:25], -1
|
||||
; CHECK-NEXT: s_mov_b64 s[8:9], -1
|
||||
; CHECK-NEXT: s_mov_b64 vcc, s[2:3]
|
||||
@ -112,9 +109,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
|
||||
; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a0, v26
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a0, v28
|
||||
; CHECK-NEXT: s_mov_b64 s[24:25], 0
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a1, v27
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a1, v29
|
||||
; CHECK-NEXT: s_mov_b64 s[8:9], 0
|
||||
; CHECK-NEXT: s_branch .LBB0_5
|
||||
; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
|
||||
@ -132,13 +129,13 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
|
||||
; CHECK-NEXT: s_cbranch_vccz .LBB0_13
|
||||
; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
|
||||
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29]
|
||||
; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27]
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17]
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17]
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v17, v16
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v27, v26
|
||||
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
|
||||
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
|
||||
; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13]
|
||||
; CHECK-NEXT: s_cselect_b32 s23, s23, 0
|
||||
; CHECK-NEXT: s_cselect_b32 s22, s22, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[8:9], -1
|
||||
|
||||
@ -18,7 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32>
|
||||
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
|
||||
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -53,7 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i
|
||||
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
|
||||
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -89,7 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i
|
||||
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
|
||||
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -127,7 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
|
||||
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -170,7 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
|
||||
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -217,7 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -265,7 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
@ -315,7 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad
|
||||
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
|
||||
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
|
||||
;
|
||||
|
||||
@ -108,7 +108,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
|
||||
@ -136,7 +136,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
|
||||
@ -164,7 +164,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
|
||||
@ -194,7 +194,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
|
||||
@ -340,7 +340,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
|
||||
@ -374,7 +374,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
|
||||
@ -408,7 +408,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
|
||||
@ -444,7 +444,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
|
||||
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
|
||||
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
|
||||
; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
|
||||
|
||||
@ -421,19 +421,19 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v0
|
||||
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB18_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -458,19 +458,19 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v0
|
||||
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB19_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
|
||||
@ -4448,18 +4448,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig
|
||||
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4771,18 +4771,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -5462,18 +5462,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
|
||||
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
|
||||
@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v3, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v0, v1
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -950,24 +948,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -997,25 +995,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v3, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v0, v1
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
||||
; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
|
||||
; GFX90A-NEXT: ; %bb.1: ; %Flow2
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
|
||||
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_4
|
||||
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
|
||||
; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
|
||||
; GFX90A-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
|
||||
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
||||
; GFX90A-NEXT: .LBB24_4: ; %Flow2
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_2
|
||||
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_6
|
||||
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7]
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v3, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v0, v1
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -950,24 +948,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -997,25 +995,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v3, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v0, v1
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
|
||||
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
||||
; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
|
||||
; GFX90A-NEXT: ; %bb.1: ; %Flow2
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
|
||||
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_4
|
||||
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
|
||||
; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
|
||||
; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
|
||||
; GFX90A-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
|
||||
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
||||
; GFX90A-NEXT: .LBB24_4: ; %Flow2
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_2
|
||||
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB24_6
|
||||
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
||||
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7]
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -49,12 +49,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
@ -62,6 +61,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -122,18 +122,18 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -245,12 +245,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
@ -258,6 +257,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -319,18 +319,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -445,27 +445,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v7, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2
|
||||
; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -533,18 +531,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1254,12 +1252,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
|
||||
; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
@ -1267,6 +1264,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1328,12 +1326,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
|
||||
; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
@ -1342,6 +1339,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1656,12 +1654,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
|
||||
@ -1669,6 +1666,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1729,18 +1727,18 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1852,12 +1850,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
|
||||
@ -1865,6 +1862,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1926,18 +1924,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2052,27 +2050,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
|
||||
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
|
||||
; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX942-NEXT: s_movk_i32 s0, 0xf800
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
|
||||
; GFX942-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX942-NEXT: flat_load_dword v7, v[4:5]
|
||||
; GFX942-NEXT: s_mov_b32 s1, -1
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2
|
||||
; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v7, v0
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2140,18 +2136,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
|
||||
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2861,12 +2857,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
|
||||
; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
|
||||
@ -2874,6 +2869,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2935,12 +2931,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
|
||||
; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
|
||||
@ -2949,6 +2944,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3295,12 +3291,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
|
||||
; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execz .LBB16_4
|
||||
; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
|
||||
@ -3308,6 +3303,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_2
|
||||
; GFX942-NEXT: ; %bb.3: ; %Flow
|
||||
@ -3433,51 +3429,47 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_3
|
||||
; GFX90A-NEXT: ; %bb.1: ; %Flow3
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_6
|
||||
; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB16_4
|
||||
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_4
|
||||
; GFX90A-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_2
|
||||
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: .LBB16_4: ; %Flow3
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB16_2
|
||||
; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB16_6
|
||||
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
|
||||
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
|
||||
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -3713,8 +3705,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
||||
; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
||||
@ -3726,31 +3718,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9]
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
|
||||
; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB17_4
|
||||
; GFX942-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execz .LBB17_2
|
||||
; GFX942-NEXT: .LBB17_6: ; %atomicrmw.private
|
||||
; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
||||
; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc
|
||||
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
|
||||
@ -3870,10 +3862,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0x7f8, v0
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
@ -3885,29 +3877,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9]
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB17_4
|
||||
; GFX90A-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB17_2
|
||||
; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -4160,8 +4152,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
|
||||
; GFX942-NEXT: s_movk_i32 s2, 0xf800
|
||||
; GFX942-NEXT: s_mov_b32 s3, -1
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
||||
; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
||||
@ -4173,31 +4165,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9]
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
|
||||
; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0
|
||||
; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
|
||||
; GFX942-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execz .LBB18_2
|
||||
; GFX942-NEXT: .LBB18_6: ; %atomicrmw.private
|
||||
; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
||||
; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc
|
||||
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
|
||||
@ -4317,10 +4309,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
|
||||
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
|
||||
; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
@ -4332,29 +4324,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9]
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
|
||||
; GFX90A-NEXT: ; %bb.5: ; %Flow
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execz .LBB18_2
|
||||
; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
||||
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
|
||||
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc
|
||||
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
||||
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
|
||||
@ -12687,20 +12687,19 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) {
|
||||
; GFX950-SDAG: ; %bb.0:
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
|
||||
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
|
||||
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0
|
||||
; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start
|
||||
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1
|
||||
; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4
|
||||
; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
|
||||
; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1
|
||||
; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -12812,20 +12811,19 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) {
|
||||
; GFX950-SDAG: ; %bb.0:
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
|
||||
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
|
||||
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0
|
||||
; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start
|
||||
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5
|
||||
; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1
|
||||
; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
|
||||
; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1
|
||||
; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4
|
||||
; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
||||
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
|
||||
; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1
|
||||
; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
|
||||
@ -46,7 +46,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
|
||||
; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
|
||||
; GFX90A-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GFX90A-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF
|
||||
; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: S_BRANCH %bb.1
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
@ -76,12 +76,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
|
||||
; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]]
|
||||
; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
|
||||
; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
|
||||
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
|
||||
; GFX90A-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
|
||||
; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF
|
||||
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: S_BRANCH %bb.2
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
@ -89,28 +90,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX90A-NEXT: successors: %bb.4(0x80000000)
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
|
||||
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
|
||||
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2
|
||||
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
|
||||
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
|
||||
; GFX90A-NEXT: S_BRANCH %bb.4
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.3.Flow:
|
||||
; GFX90A-NEXT: successors: %bb.5(0x80000000)
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
|
||||
; GFX90A-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4
|
||||
; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: S_BRANCH %bb.5
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.4 (%ir-block.35):
|
||||
; GFX90A-NEXT: successors: %bb.3(0x80000000)
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
|
||||
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2
|
||||
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
|
||||
; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
|
||||
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
|
||||
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
|
||||
; GFX90A-NEXT: early-clobber %48:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %48, 0, 0, implicit $mode, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]]
|
||||
; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
|
||||
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]]
|
||||
; GFX90A-NEXT: S_BRANCH %bb.3
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.5 (%ir-block.41):
|
||||
@ -128,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
|
||||
; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
|
||||
; GFX942-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GFX942-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF
|
||||
; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX942-NEXT: S_BRANCH %bb.1
|
||||
; GFX942-NEXT: {{ $}}
|
||||
@ -158,12 +161,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
|
||||
; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
|
||||
; GFX942-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]]
|
||||
; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63
|
||||
; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
|
||||
; GFX942-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
|
||||
; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
|
||||
; GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
|
||||
; GFX942-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
|
||||
; GFX942-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF
|
||||
; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX942-NEXT: S_BRANCH %bb.2
|
||||
; GFX942-NEXT: {{ $}}
|
||||
@ -171,28 +175,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
|
||||
; GFX942-NEXT: successors: %bb.4(0x80000000)
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
|
||||
; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
|
||||
; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2
|
||||
; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
|
||||
; GFX942-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
|
||||
; GFX942-NEXT: S_BRANCH %bb.4
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: bb.3.Flow:
|
||||
; GFX942-NEXT: successors: %bb.5(0x80000000)
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
|
||||
; GFX942-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4
|
||||
; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX942-NEXT: S_BRANCH %bb.5
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: bb.4 (%ir-block.35):
|
||||
; GFX942-NEXT: successors: %bb.3(0x80000000)
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
|
||||
; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2
|
||||
; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
|
||||
; GFX942-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
|
||||
; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
|
||||
; GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
|
||||
; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
|
||||
; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
|
||||
; GFX942-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec
|
||||
; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec
|
||||
; GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]]
|
||||
; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
|
||||
; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec
|
||||
; GFX942-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]]
|
||||
; GFX942-NEXT: S_BRANCH %bb.3
|
||||
; GFX942-NEXT: {{ $}}
|
||||
; GFX942-NEXT: bb.5 (%ir-block.41):
|
||||
|
||||
@ -38,33 +38,35 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
|
||||
; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
|
||||
; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1)
|
||||
; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]]
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.1.atomicrmw.start:
|
||||
; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1
|
||||
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1
|
||||
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1
|
||||
; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1
|
||||
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0
|
||||
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
|
||||
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
|
||||
; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
|
||||
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]]
|
||||
; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1)
|
||||
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1
|
||||
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0
|
||||
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
|
||||
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
|
||||
; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3
|
||||
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]]
|
||||
; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1)
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
|
||||
; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: S_BRANCH %bb.2
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.2.atomicrmw.end:
|
||||
; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.1
|
||||
; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1
|
||||
; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
|
||||
; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
|
||||
; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
|
||||
; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
|
||||
; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
|
||||
; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
|
||||
|
||||
@ -90,18 +90,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -287,18 +287,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -486,18 +486,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1283,12 +1283,11 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
@ -1297,6 +1296,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1697,18 +1697,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1912,18 +1912,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2319,18 +2319,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2719,18 +2719,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3712,18 +3712,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4130,18 +4130,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4532,18 +4532,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -6724,18 +6724,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr
|
||||
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
|
||||
@ -35,24 +35,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -80,23 +80,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -197,24 +197,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -242,23 +242,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -361,24 +361,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
@ -406,23 +406,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
|
||||
; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
|
||||
|
||||
@ -35,24 +35,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -80,23 +80,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
|
||||
@ -197,24 +197,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -242,23 +242,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -361,24 +361,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
@ -406,23 +406,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
|
||||
@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
|
||||
@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
|
||||
@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
|
||||
@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX942-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX942-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
||||
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
|
||||
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
|
||||
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
||||
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
|
||||
@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
|
||||
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
|
||||
; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
|
||||
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
|
||||
; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
|
||||
|
||||
@ -50,12 +50,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
@ -63,6 +62,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -123,18 +123,18 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -282,12 +282,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
@ -295,6 +294,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -355,18 +355,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -516,12 +516,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
@ -529,6 +528,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -589,18 +589,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1438,12 +1438,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
|
||||
; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
@ -1451,6 +1450,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1511,12 +1511,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
|
||||
; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
@ -1525,6 +1524,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1905,12 +1905,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
|
||||
@ -1918,6 +1917,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -1978,18 +1978,18 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2137,12 +2137,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
|
||||
@ -2150,6 +2149,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2210,18 +2210,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2371,12 +2371,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
|
||||
@ -2384,6 +2383,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -2444,18 +2444,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3293,12 +3293,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
|
||||
; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
|
||||
@ -3306,6 +3305,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
|
||||
; GFX942-NEXT: buffer_inv sc0 sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3366,12 +3366,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
|
||||
; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
|
||||
@ -3380,6 +3379,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3760,12 +3760,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f64:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
|
||||
; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
|
||||
@ -3773,6 +3772,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -3836,18 +3836,18 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4012,12 +4012,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
|
||||
; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0
|
||||
@ -4025,6 +4024,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4088,18 +4088,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4265,12 +4265,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
|
||||
; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX942-NEXT: buffer_wbl2 sc1
|
||||
; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0
|
||||
@ -4278,6 +4277,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
|
||||
; GFX942-NEXT: buffer_inv sc1
|
||||
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
@ -4341,18 +4341,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
|
||||
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
|
||||
@ -1376,47 +1376,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
|
||||
;
|
||||
; GFX90A-LABEL: test_mfma_loop_sgpr_init:
|
||||
; GFX90A: ; %bb.0: ; %entry
|
||||
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
|
||||
; GFX90A-NEXT: s_mov_b32 s0, 16
|
||||
; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a28, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a27, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a26, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a25, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a24, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a23, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a22, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a21, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a20, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a19, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a18, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a17, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a14, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a13, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a12, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a11, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a10, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a9, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a8, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GFX90A-NEXT: s_mov_b32 s0, 16
|
||||
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_nop 1
|
||||
; GFX90A-NEXT: s_nop 0
|
||||
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
|
||||
; GFX90A-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
|
||||
@ -1438,47 +1437,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
|
||||
;
|
||||
; GFX942-LABEL: test_mfma_loop_sgpr_init:
|
||||
; GFX942: ; %bb.0: ; %entry
|
||||
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
|
||||
; GFX942-NEXT: s_mov_b32 s0, 16
|
||||
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a30, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a29, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a28, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a27, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a26, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a25, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a24, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a23, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a22, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a21, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a20, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a19, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a18, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a17, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a15, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a14, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a13, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a12, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a11, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a10, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a9, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a8, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a7, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a6, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a5, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a4, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a3, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a2, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
|
||||
; GFX942-NEXT: s_mov_b32 s0, 16
|
||||
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
|
||||
; GFX942-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
|
||||
@ -1643,13 +1641,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
;
|
||||
; GFX90A-LABEL: test_mfma_loop_mixed_init:
|
||||
; GFX90A: ; %bb.0: ; %entry
|
||||
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
|
||||
; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c
|
||||
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
|
||||
@ -1679,7 +1677,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
|
||||
; GFX90A-NEXT: s_mov_b32 s0, 16
|
||||
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
|
||||
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
|
||||
@ -1706,13 +1703,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
;
|
||||
; GFX942-LABEL: test_mfma_loop_mixed_init:
|
||||
; GFX942: ; %bb.0: ; %entry
|
||||
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
|
||||
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
|
||||
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, s0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
|
||||
@ -1742,7 +1739,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
|
||||
; GFX942-NEXT: s_mov_b32 s0, 16
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
|
||||
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
|
||||
@ -2401,7 +2397,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
|
||||
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
|
||||
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
|
||||
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
|
||||
@ -2471,7 +2466,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
|
||||
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
|
||||
; GFX942-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
|
||||
|
||||
@ -6,15 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX942-LABEL: matmul_kernel:
|
||||
; GFX942: ; %bb.0: ; %entry
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_mov_b32 s2, 0
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
|
||||
; GFX942-NEXT: s_mov_b32 s3, 0
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
|
||||
; GFX942-NEXT: s_branch .LBB0_2
|
||||
; GFX942-NEXT: .LBB0_1: ; %bb2
|
||||
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
|
||||
@ -22,12 +22,14 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX942-NEXT: s_ashr_i32 s5, s3, 31
|
||||
; GFX942-NEXT: s_mov_b32 s3, s2
|
||||
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
|
||||
; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1
|
||||
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
|
||||
; GFX942-NEXT: s_and_b32 s3, s5, s4
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
|
||||
; GFX942-NEXT: s_nop 6
|
||||
; GFX942-NEXT: v_accvgpr_read_b32 v0, a2
|
||||
; GFX942-NEXT: s_cbranch_execz .LBB0_4
|
||||
; GFX942-NEXT: .LBB0_2: ; %bb
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
@ -35,7 +37,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
|
||||
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
|
||||
; GFX942-NEXT: ; %bb.3:
|
||||
; GFX942-NEXT: ; implicit-def: $sgpr3
|
||||
; GFX942-NEXT: ; implicit-def: $agpr2
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX942-NEXT: .LBB0_4: ; %common.ret
|
||||
; GFX942-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user