There are three calls to bumpCycle in bumpNode. Prior to the first call, we calculate NextCycle as the next cycle in which all of a given instruction's required hardware resources (as defined by the SchedModel) are available. Any gap between this calculated NextCycle and CurrCycle measures the stalls that must occur before we can schedule the given instruction. The second and third calls handle adjustments that occur during or after issue of the instruction (e.g. if the number of micro-ops exceeds the issue width). According to the documentation of HazardRec->EmitInstruction, we should call this method when an instruction is emitted: "This callback is invoked when an instruction is emitted, to advance the hazard state." In the context of bumpNode, this implies that it should be called after the bumpCycle call that accounts for stalls which must occur before the instruction issues, but before the bumpCycle calls that account for stalls which occur during or after issue. This PR moves the call to that position. In practice, this affects schedulers that use both the SchedModel and HazardRec. Suppose we have instructions A, B and C, and a partial schedule AB. Also suppose that instruction A exclusively holds ProcResource X for 2 cycles, that B uses ProcResource X, and that there is a HazardRec hazard between B and C which requires a 1-cycle stall. Currently, we call HazardRec->EmitInstruction on B before we call HazardRec->AdvanceCycle for the stall between A and B. Then, when deciding whether to schedule C, HazardRec sees that a cycle has already elapsed after B, so it concludes that we do not need to stall. After this change, we call HazardRec->EmitInstruction on B after we call HazardRec->AdvanceCycle for the stall between A and B, so HazardRec correctly places the stall cycle between A and B. Then, when deciding whether to schedule C, HazardRec correctly sees that no cycles have elapsed after B, so we do need to stall for 1 cycle.
21528 lines
915 KiB
LLVM
21528 lines
915 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX950 %s
|
|
|
|
;---------------------------------------------------------------------
|
|
; xchg i32 cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input and result use AGPR
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=a" constraint and the returned value is
; consumed by inline asm with the "a" constraint; the "def a0" / "use a0"
; checks below show both living in an AGPR.
define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=a" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "a" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AGPR, result used as VGPR.
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=a" constraint ("def a0" in the checks) and
; the returned value is consumed by inline asm with the "v" constraint
; ("use v0" in the checks).
define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=a" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "v" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "v"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is VGPR, result used as AGPR
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=v" constraint ("def v2" in the checks) and
; the returned value is consumed by inline asm with the "a" constraint
; ("use a0" in the checks).
define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=v" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=v"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "a" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result also used as AV
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. Both the swap operand and
; the consumer of the returned value use the "^VA" (AV) inline asm constraint;
; the "def v2" / "use v0" checks below show VGPRs being chosen for both.
define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=^VA" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "^VA" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result used as v
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=^VA" (AV) constraint ("def v2" in the
; checks) and the returned value is consumed by inline asm with the "v"
; constraint ("use v0" in the checks).
define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=^VA" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "v" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "v"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result used as a
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=^VA" (AV) constraint ("def v2" in the
; checks) and the returned value is consumed by inline asm with the "a"
; constraint ("use a0" in the checks).
define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=^VA" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "a" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is a, result used as AV
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=a" constraint ("def a0" in the checks) and
; the returned value is consumed by inline asm with the "^VA" (AV) constraint
; ("use v0" in the checks).
define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=a" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "^VA" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is v, result used as AV
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer. The swap operand is
; produced by inline asm with the "=v" constraint ("def v2" in the checks) and
; the returned value is consumed by inline asm with the "^VA" (AV) constraint
; ("use v0" in the checks).
define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=v" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=v"()
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; "^VA" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer with "^VA" (AV) constraints
; on both the swap operand and the consumer of the result, while two
; <32 x i32> values pinned to v[0:31] and v[32:63] are defined before the
; atomic and used after it. The checks below show the resulting VGPR spills,
; the "Reload Reuse" copies through AGPRs, and the AV operand/result being
; placed in a2 / a0.
define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: s_nop 0
|
|
; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
|
|
; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=^VA" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=^VA"()
|
|
; Sideeffect asm defining two <32 x i32> values pinned to v[0:31] and
; v[32:63]; only $0 is printed, hence the single "def v[0:31]" in the checks.
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
|
|
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
|
|
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
|
|
%result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
; Both pinned vectors are used after the atomic, so they are live across it.
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
|
|
; "^VA" input constraint: consumes the value returned by the swap.
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer whose result is never
; used. The swap operand is produced by inline asm with the "=a" constraint;
; the checks below show flat_atomic_swap without a destination register and
; taking a0 directly as its data operand.
define void @flat_atomic_xchg_i32_noret_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_noret_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v[0:1], a0 offset:40
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_noret_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v[0:1], a0 offset:40 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=a" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=a"()
|
|
; The returned value is deliberately left unused.
%unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
; seq_cst i32 atomicrmw xchg through a flat pointer whose result is never
; used. The swap operand is produced by inline asm with the "=^VA" (AV)
; constraint; the checks below show flat_atomic_swap without a destination
; register and taking v2 as its data operand.
define void @flat_atomic_xchg_i32_noret_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i32_noret_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 offset:40
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i32_noret_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap v[0:1], v2 offset:40 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of a [512 x i32] array, i.e. byte offset 40 (the
; offset:40 operand of flat_atomic_swap in the checks above).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
; "=^VA" output constraint: defines the value that is swapped in.
%data = call i32 asm "; def $0", "=^VA"()
|
|
; The returned value is deliberately left unused.
%unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; xchg i64 cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input and result use AGPR.
; The i64 data operand is defined by inline asm with the "=a" constraint and
; the atomicrmw result is consumed by inline asm with the "a" constraint.
define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB11_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword a3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB11_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off
; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Input is AGPR, result used as VGPR.
; The i64 data operand is defined by inline asm with the "=a" constraint and
; the atomicrmw result is consumed by inline asm with the "v" constraint.
define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB12_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB12_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB12_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB12_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
; GFX950-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "v"(i64 %result)
  ret void
}
|
|
|
|
; Input is VGPR, result used as AGPR.
; The i64 data operand is defined by inline asm with the "=v" constraint and
; the atomicrmw result is consumed by inline asm with the "a" constraint.
define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB13_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB13_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=v"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Input is AV, result also used as AV.
; Both the inline asm defining the i64 data operand ("=^VA") and the inline asm
; consuming the atomicrmw result ("^VA") use the AV constraint.
define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB14_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; Input is AV, result used as VGPR.
; The i64 data operand is defined by inline asm with the "=^VA" constraint and
; the atomicrmw result is consumed by inline asm with the "v" constraint.
define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB15_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "v"(i64 %result)
  ret void
}
|
|
|
|
; Input is AV, result used as AGPR.
; The i64 data operand is defined by inline asm with the "=^VA" constraint and
; the atomicrmw result is consumed by inline asm with the "a" constraint.
define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB16_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB16_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Input is AGPR, result used as AV.
; The i64 data operand is defined by inline asm with the "=a" constraint and
; the atomicrmw result is consumed by inline asm with the "^VA" constraint.
define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB17_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB17_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB17_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB17_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
; GFX950-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; Input is VGPR, result used as AV.
; The i64 data operand is defined by inline asm with the "=v" constraint and
; the atomicrmw result is consumed by inline asm with the "^VA" constraint.
define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB18_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB18_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB18_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB18_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; IR under test. Element 10 of [512 x i64] is byte offset 80 (0x50), the
  ; constant the checks above add to the incoming pointer. The checks show the
  ; xchg split into %atomicrmw.global and %atomicrmw.private paths.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=v"()
  %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; i64 flat atomicrmw xchg whose result is unused; the data operand is defined
; by inline asm with the "=a" constraint (the checks show it in a[0:1]).
; Both check prefixes expect a split into %atomicrmw.global, which issues
; flat_atomic_swap_x2, and %atomicrmw.private, which only stores the new value.
define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB19_4
|
|
; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
|
|
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i64_noret_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB19_3
|
|
; GFX950-NEXT: ; %bb.1: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB19_4
|
|
; GFX950-NEXT: .LBB19_2: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX950-NEXT: .LBB19_3: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1] sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB19_2
|
|
; GFX950-NEXT: .LBB19_4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v0, a[0:1], off
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
; i64 flat atomicrmw xchg whose result is unused; the data operand is defined
; by inline asm with the "=^VA" constraint (the checks show it in v[2:3]).
; Both check prefixes expect a split into %atomicrmw.global, which issues
; flat_atomic_swap_x2, and %atomicrmw.private, which only stores the new value.
define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i64_noret_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB20_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB20_4
|
|
; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB20_2
|
|
; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i64_noret_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB20_3
|
|
; GFX950-NEXT: ; %bb.1: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB20_4
|
|
; GFX950-NEXT: .LBB20_2: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX950-NEXT: .LBB20_3: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB20_2
|
|
; GFX950-NEXT: .LBB20_4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; xor i32 cases with cmpxchg expansion
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input is AGPR ("=a") and the result is used as AGPR ("a").
; The checks expect a flat_atomic_cmpswap retry loop that works on VGPRs:
; the input is copied out of a0 with v_accvgpr_read_b32 before the loop and
; the result is copied back into a0 with v_accvgpr_write_b32 after it.
|
|
define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB21_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AGPR ("=a"), result used as VGPR ("v").
; The checks expect a flat_atomic_cmpswap retry loop; the input is copied out
; of a0 with v_accvgpr_read_b32 before the loop and the result is consumed
; directly from v2.
|
|
define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB22_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "v"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is VGPR ("=v"), result used as AGPR ("a").
; The checks expect a flat_atomic_cmpswap retry loop; the input is defined
; directly in v4 and the result is copied into a0 with v_accvgpr_write_b32
; after the loop.
|
|
define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB23_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=v"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV ("=^VA"), result also used as AV ("^VA").
; The checks expect a flat_atomic_cmpswap retry loop with both the input (v4)
; and the result (v2) in VGPRs, and no v_accvgpr copies.
|
|
define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB24_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV ("=^VA"), result used as VGPR ("v").
; The checks expect a flat_atomic_cmpswap retry loop with both the input (v4)
; and the result (v2) in VGPRs, and no v_accvgpr copies.
|
|
define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB25_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "v"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV ("=^VA"), result used as AGPR ("a").
; The checks expect a flat_atomic_cmpswap retry loop; the input is defined in
; v4 and the result is copied into a0 with v_accvgpr_write_b32 after the loop.
|
|
define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB26_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AGPR ("=a"), result used as AV ("^VA").
; The checks expect a flat_atomic_cmpswap retry loop; the input is copied out
; of a0 with v_accvgpr_read_b32 before the loop and the result is consumed
; directly from v2.
|
|
define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB27_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is VGPR ("=v"), result used as AV ("^VA").
; The checks expect a flat_atomic_cmpswap retry loop with both the input (v4)
; and the result (v2) in VGPRs, and no v_accvgpr copies.
|
|
define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB28_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 has no users; the atomicrmw below addresses %ptr
; directly, and the checks above use v[0:1] without any added offset.
; Confirm this is intended before regenerating the assertions.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=v"()
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; i32 'atomicrmw xor ... seq_cst' on a flat pointer where both the data operand
; and the result go through inline asm with the "^VA" constraint, while two
; further inline-asm statements define and later use v[0:31] and v[32:63]
; across the atomic.
; For both check prefixes the expected code lowers the atomic to a
; flat_atomic_cmpswap retry loop (.LBB29_1), copies v0-v31 into a0-a31 before
; the loop and back afterwards, and spills/reloads v40-v47, v56-v63 and a32-a34.
define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33
|
|
; GFX90A-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a34
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34
|
|
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a32
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a33, v1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a34
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a2, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a3, v3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a4, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a5, v5
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a6, v6
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a7, v7
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a8, v8
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a9, v9
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a10, v10
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a11, v11
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a12, v12
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a13, v13
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a14, v14
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a15, v15
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a16, v16
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a17, v17
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a18, v18
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a19, v19
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a20, v20
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a21, v21
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a22, v22
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a23, v23
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a24, v24
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a26, v26
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a27, v27
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a28, v28
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a29, v29
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a30, v30
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a31, v31
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a32
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a33
|
|
; GFX950-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a34
|
|
; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB29_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a4
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a5
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v6, a6
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v7, a7
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v8, a8
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v9, a9
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v10, a10
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v11, a11
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v12, a12
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v13, a13
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v14, a14
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v15, a15
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v16, a16
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v17, a17
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v18, a18
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v19, a19
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v20, a20
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v21, a21
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v22, a22
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v23, a23
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v24, a24
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v26, a26
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v27, a27
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v28, a28
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v29, a29
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v30, a30
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v31, a31
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a32
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is computed but never used; the atomicrmw below operates
; on %ptr directly, and the checked flat_load_dword has no offset. Confirm
; whether the atomic was meant to use %gep.0.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
|
|
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
|
|
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
|
|
%result = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; i32 'atomicrmw xor ... seq_cst' on a flat pointer where the data operand is
; defined by inline asm with the "=a" constraint and the atomic's result
; (%unused) is never read.
; For both check prefixes the expected code copies a0 into v4 with
; v_accvgpr_read_b32 and runs a flat_atomic_cmpswap retry loop (.LBB30_1).
define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB30_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB30_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is computed but never used; the atomicrmw below operates
; on %ptr directly, and the checked flat_load_dword has no offset. Confirm
; whether the atomic was meant to use %gep.0.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%unused = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
; i32 'atomicrmw xor ... seq_cst' on a flat pointer where the data operand is
; defined by inline asm with the "=^VA" constraint and the atomic's result
; (%unused) is never read.
; For both check prefixes the expected code has the asm define v4 directly
; (no v_accvgpr_read_b32 copy) and runs a flat_atomic_cmpswap retry loop
; (.LBB31_1).
define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB31_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB31_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is computed but never used; the atomicrmw below operates
; on %ptr directly, and the checked flat_load_dword has no offset. Confirm
; whether the atomic was meant to use %gep.0.
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%unused = atomicrmw xor ptr %ptr, i32 %data seq_cst
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; xor i64 cases with cmpxchg expansion
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input and result use AGPR
|
|
; i64 'atomicrmw xor ... seq_cst' on a flat pointer; the data operand is defined
; by inline asm with the "=a" constraint and the result is consumed by inline
; asm with the "a" constraint.
; For both check prefixes the expected code compares v1 (from the pointer pair
; v[0:1]) against a register loaded from src_private_base and then takes one of
; two paths: %atomicrmw.global runs a flat_atomic_cmpswap_x2 retry loop
; (.LBB32_2), while %atomicrmw.private does a plain scratch load / xor / store.
; Each path writes its result into a[0:1] before the paths rejoin at
; %atomicrmw.phi.
define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB32_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB32_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: .LBB32_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB32_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB32_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB32_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB32_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: .LBB32_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB32_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is computed but never used; the atomicrmw below operates
; on %ptr directly, and the checked flat_load_dwordx2 has no offset. Confirm
; whether the atomic was meant to use %gep.0.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AGPR, result used as VGPR.
|
|
; i64 'atomicrmw xor ... seq_cst' on a flat pointer; the data operand is defined
; by inline asm with the "=a" constraint and the result is consumed by inline
; asm with the "v" constraint.
; For both check prefixes the expected code compares v1 (from the pointer pair
; v[0:1]) against a register loaded from src_private_base and then takes one of
; two paths: %atomicrmw.global runs a flat_atomic_cmpswap_x2 retry loop
; (.LBB33_2), while %atomicrmw.private does a plain scratch load / xor / store.
; Both paths leave the result in v[2:3], which the final asm uses.
define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB33_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB33_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: .LBB33_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB33_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB33_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB33_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB33_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
|
|
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc0 sc1
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB33_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: .LBB33_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB33_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
|
|
; GFX950-NEXT: .LBB33_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is computed but never used; the atomicrmw below operates
; on %ptr directly, and the checked flat_load_dwordx2 has no offset. Confirm
; whether the atomic was meant to use %gep.0.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data seq_cst
|
|
call void asm "; use $0", "v"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is VGPR, result used as AGPR
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=v" and whose result is consumed by an inline asm
; constrained to "a". The assertions record the expected GFX90A and GFX950
; output: the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB34_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB34_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB34_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB34_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=v" constraint.
  %data = call i64 asm "; def $0", "=v"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value must be made available under the "a" constraint.
  call void asm "; use $0", "a"(i64 %result)
  ret void
}

; Input is AV, result also used as AV
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=^VA" and whose result is consumed by an inline
; asm constrained to "^VA". The assertions record the expected GFX90A and
; GFX950 output: the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi. In the recorded output both
; the def and the use land in v registers.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB35_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB35_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB35_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=^VA" constraint.
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value is consumed under the "^VA" constraint.
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}

; Input is AV, result used as v
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=^VA" and whose result is consumed by an inline
; asm constrained to "v". The assertions record the expected GFX90A and GFX950
; output: the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB36_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB36_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB36_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=^VA" constraint.
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value is consumed under the "v" constraint.
  call void asm "; use $0", "v"(i64 %result)
  ret void
}

; Input is AV, result used as a
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=^VA" and whose result is consumed by an inline
; asm constrained to "a". The assertions record the expected GFX90A and GFX950
; output: the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi; both paths copy the old value
; into a[0:1] with v_accvgpr_write_b32.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB37_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB37_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v6
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v7
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB37_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB37_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB37_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=^VA" constraint.
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value must be made available under the "a" constraint.
  call void asm "; use $0", "a"(i64 %result)
  ret void
}

; Input is a, result used as AV
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=a" and whose result is consumed by an inline asm
; constrained to "^VA". The assertions record the expected GFX90A and GFX950
; output: the a[0:1] input is first copied to v[6:7] with v_accvgpr_read_b32,
; then the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB38_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB38_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB38_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=a" constraint.
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value is consumed under the "^VA" constraint.
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}

; Input is v, result used as AV
;
; 64-bit flat `atomicrmw xor ... seq_cst` whose data operand is produced by an
; inline asm constrained to "=v" and whose result is consumed by an inline asm
; constrained to "^VA". The assertions record the expected GFX90A and GFX950
; output: the operation is split into an %atomicrmw.global path (a
; flat_atomic_cmpswap_x2 retry loop) and an %atomicrmw.private path (plain
; load / xor / store), joined at %atomicrmw.phi.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB39_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB39_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB39_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  ; NOTE(review): %gep.0 is computed but never used; the atomicrmw below takes
  ; %ptr directly, and the recorded output agrees (no offset on the flat
  ; address). Confirm whether %gep.0 was meant to be the pointer operand; if
  ; so, switch the operand and regenerate the assertions.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  ; Data operand is defined in a register chosen by the "=v" constraint.
  %data = call i64 asm "; def $0", "=v"()
  %result = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ; The returned old value is consumed under the "^VA" constraint.
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}

; 64-bit atomicrmw xor on a flat pointer, result unused, data operand produced
; in an AGPR pair by inline asm (constraint "=a"). The atomicrmw has no
; syncscope and no !amdgpu.* metadata; the checks below show it expanded into:
;   - an aperture test of the pointer's high half against src_private_base,
;   - a flat_atomic_cmpswap_x2 retry loop for the %atomicrmw.global path,
;   - a plain load / xor / store sequence for the %atomicrmw.private path.
; The AGPR pair is copied into v[6:7] with v_accvgpr_read_b32 before use.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free addresses in the checks.
; Confirm this is intended before changing it (checks would need regenerating).
define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_3
; GFX90A-NEXT: ; %bb.1: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_6
; GFX90A-NEXT: .LBB40_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB40_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB40_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_4
; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: .LBB40_6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB40_3
; GFX950-NEXT: ; %bb.1: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB40_6
; GFX950-NEXT: .LBB40_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB40_3: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB40_4: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB40_4
; GFX950-NEXT: ; %bb.5: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB40_2
; GFX950-NEXT: .LBB40_6: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6
; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %unused = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ret void
}
|
|
|
|
; 64-bit atomicrmw xor on a flat pointer, result unused, data operand produced
; by inline asm with the "=^VA" (AGPR-or-VGPR) constraint. The checks show the
; value defined directly in v[6:7], so no v_accvgpr_read_b32 copies appear.
; The atomicrmw has no syncscope and no !amdgpu.* metadata; the checks show
; the same three-part expansion as the sibling expansion tests: aperture test
; against src_private_base, flat_atomic_cmpswap_x2 loop (%atomicrmw.global),
; and load / xor / store on the %atomicrmw.private path.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free addresses in the checks.
; Confirm this is intended before changing it (checks would need regenerating).
define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_3
; GFX90A-NEXT: ; %bb.1: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_6
; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB41_3: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB41_4: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_4
; GFX90A-NEXT: ; %bb.5: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB41_2
; GFX90A-NEXT: .LBB41_6: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB41_3
; GFX950-NEXT: ; %bb.1: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB41_6
; GFX950-NEXT: .LBB41_2: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
; GFX950-NEXT: .LBB41_3: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB41_4: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB41_4
; GFX950-NEXT: ; %bb.5: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB41_2
; GFX950-NEXT: .LBB41_6: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6
; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %unused = atomicrmw xor ptr %ptr, i64 %data seq_cst
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; xor i32 cases with instruction
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input and result both use AGPRs.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show it selected as a single flat_atomic_xor. The "=a" data operand
; is copied to v2 with v_accvgpr_read_b32 before the atomic, and the returned
; value is copied back to a0 with v_accvgpr_write_b32 for the "a" use.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; Input is AGPR, result used as VGPR.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor. The "=a" data operand is copied to v2
; with v_accvgpr_read_b32; the result stays in v0 for the "v" use, so no
; v_accvgpr_write_b32 appears.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "v"(i32 %result)
  ret void
}
|
|
|
|
; Input is VGPR, result used as AGPR.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor fed directly from the "=v" operand in
; v2. The returned value is copied to a0 with v_accvgpr_write_b32 for the "a"
; use.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_v_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=v"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; Input is AV, result also used as AV.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor. Both inline asm operands use the
; "^VA" (AGPR-or-VGPR) constraint and the checks show VGPRs chosen for both
; (def v2 / use v0), so no accvgpr copies appear.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; Input is AV, result used as VGPR.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor. The "=^VA" data operand is shown
; defined in v2 and the "v" use reads v0, so no accvgpr copies appear.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "v"(i32 %result)
  ret void
}
|
|
|
|
; Input is AV, result used as AGPR.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor. The "=^VA" data operand is shown
; defined in v2; the returned value is copied to a0 with v_accvgpr_write_b32
; for the "a" use.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; Input is AGPR, result used as AV.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor. The "=a" data operand is copied to v2
; with v_accvgpr_read_b32; the "^VA" use is shown reading v0 directly, so no
; v_accvgpr_write_b32 appears.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; Input is VGPR, result used as AV.
; 32-bit agent-scope atomicrmw xor tagged !amdgpu.no.fine.grained.memory; the
; checks show a single flat_atomic_xor fed directly from the "=v" operand in
; v2, with the "^VA" use reading v0, so no accvgpr copies appear.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly, which matches the offset-free address in the checks.
define void @flat_atomic_xor_i32_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=v"()
  %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: s_nop 0
|
|
; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:31]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
|
|
; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:31]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
|
|
%vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
|
|
%vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
|
|
%result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; i32 atomic xor whose data operand is defined by inline asm with the "=a"
; constraint (the checks below show it living in a0). The atomicrmw result
; (%unused) is never read, so the checks expect the flat_atomic_xor form
; without a destination register.
define void @flat_atomic_xor_i32_noret_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i32_noret_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_xor v[0:1], a0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i32_noret_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor v[0:1], a0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly (the checks show no offset).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; i32 atomic xor whose data operand is defined by inline asm with the "=^VA"
; (AV) constraint; the checks below show it being placed in a VGPR (v2).
; The atomicrmw result (%unused) is never read, so the checks expect the
; flat_atomic_xor form without a destination register.
define void @flat_atomic_xor_i32_noret_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i32_noret_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_xor v[0:1], v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i32_noret_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor v[0:1], v2
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly (the checks show no offset).
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; xor i64 cases with instruction
|
|
;---------------------------------------------------------------------
|
|
|
|
; Input and result use AGPR: the data operand is defined with "=a" and the
; atomicrmw result is consumed with "a".
; The checks show the i64 flat atomic split into two paths, chosen by
; comparing the high dword of the pointer against src_private_base:
; %atomicrmw.global issues flat_atomic_xor_x2, while %atomicrmw.private does
; a load / v_xor / store sequence; both paths rejoin at %atomicrmw.phi.
|
|
define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB53_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB53_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB53_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2
|
|
; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB53_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB53_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB53_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AGPR, result used as VGPR: the data operand is defined with "=a"
; and the atomicrmw result is consumed with "v".
; The checks show the i64 flat atomic split into two paths, chosen by
; comparing the high dword of the pointer against src_private_base:
; %atomicrmw.global issues flat_atomic_xor_x2, while %atomicrmw.private does
; a load / v_xor / store sequence; both paths rejoin at %atomicrmw.phi.
|
|
define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB54_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB54_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB54_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB54_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB54_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB54_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB54_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
|
|
; GFX950-NEXT: .LBB54_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "v"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is VGPR, result used as AGPR: the data operand is defined with "=v"
; and the atomicrmw result is consumed with "a".
; The checks show the i64 flat atomic split into two paths, chosen by
; comparing the high dword of the pointer against src_private_base:
; %atomicrmw.global issues flat_atomic_xor_x2, while %atomicrmw.private does
; a load / v_xor / store sequence; both paths rejoin at %atomicrmw.phi.
|
|
define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB55_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB55_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB55_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2
|
|
; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_v_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB55_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB55_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB55_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=v"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result also used as AV: the data operand is defined with
; "=^VA" and the atomicrmw result is consumed with "^VA". The checks show
; both ending up in VGPRs (def v[4:5], use v[2:3]).
; The checks show the i64 flat atomic split into two paths, chosen by
; comparing the high dword of the pointer against src_private_base:
; %atomicrmw.global issues flat_atomic_xor_x2, while %atomicrmw.private does
; a load / v_xor / store sequence; both paths rejoin at %atomicrmw.phi.
|
|
define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB56_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB56_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB56_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB56_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB56_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB56_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB56_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
|
|
; GFX950-NEXT: .LBB56_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result used as VGPR: the data operand is defined with "=^VA"
; and the atomicrmw result is consumed with "v".
; The checks show the i64 flat atomic split into two paths, chosen by
; comparing the high dword of the pointer against src_private_base:
; %atomicrmw.global issues flat_atomic_xor_x2, while %atomicrmw.private does
; a load / v_xor / store sequence; both paths rejoin at %atomicrmw.phi.
|
|
define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_v:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB57_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB57_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB57_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB57_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_v:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB57_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB57_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB57_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
|
|
; GFX950-NEXT: .LBB57_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; NOTE(review): %gep.0 is not referenced by any later instruction; the
; atomicrmw below addresses %ptr directly.
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "v"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is AV, result used as AGPR
|
|
; 64-bit atomicrmw xor (syncscope "agent", seq_cst). The data operand is
; defined by inline asm with the "=^VA" constraint and the result is consumed
; by inline asm with the "a" constraint. The checks cover the
; %atomicrmw.global and %atomicrmw.private paths for GFX90A and GFX950.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly. Confirm whether %gep.0 was the intended pointer -- the
; autogenerated CHECK lines would need regenerating if the operand changes.
define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB58_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB58_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB58_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v2
|
|
; GFX90A-NEXT: v_xor_b32_e32 v3, v4, v3
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_av_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB58_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB58_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB58_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is a, result used as AV
|
|
; 64-bit atomicrmw xor (syncscope "agent", seq_cst). The data operand is
; defined by inline asm with the "=a" constraint and the result is consumed
; by inline asm with the "^VA" constraint. The checks cover the
; %atomicrmw.global and %atomicrmw.private paths for GFX90A and GFX950.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly. Confirm whether %gep.0 was the intended pointer -- the
; autogenerated CHECK lines would need regenerating if the operand changes.
define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB59_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB59_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB59_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB59_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB59_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB59_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB59_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
|
|
; GFX950-NEXT: .LBB59_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Input is v, result used as AV
|
|
; 64-bit atomicrmw xor (syncscope "agent", seq_cst). The data operand is
; defined by inline asm with the "=v" constraint and the result is consumed
; by inline asm with the "^VA" constraint. The checks cover the
; %atomicrmw.global and %atomicrmw.private paths for GFX90A and GFX950.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly. Confirm whether %gep.0 was the intended pointer -- the
; autogenerated CHECK lines would need regenerating if the operand changes.
define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB60_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB60_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB60_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
|
|
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB60_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_ret_v_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB60_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB60_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB60_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
|
|
; GFX950-NEXT: .LBB60_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=v"()
|
|
%result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit atomicrmw xor (syncscope "agent", seq_cst) whose result (%unused) is
; never read. The data operand is defined by inline asm with the "=a"
; constraint. The checks cover the %atomicrmw.global and %atomicrmw.private
; paths for GFX90A and GFX950.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly. Confirm whether %gep.0 was the intended pointer -- the
; autogenerated CHECK lines would need regenerating if the operand changes.
define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_noret_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB61_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB61_4
|
|
; GFX90A-NEXT: .LBB61_2: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX90A-NEXT: .LBB61_3: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB61_2
|
|
; GFX90A-NEXT: .LBB61_4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_noret_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB61_3
|
|
; GFX950-NEXT: ; %bb.1: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB61_4
|
|
; GFX950-NEXT: .LBB61_2: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX950-NEXT: .LBB61_3: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB61_2
|
|
; GFX950-NEXT: .LBB61_4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; 64-bit atomicrmw xor (syncscope "agent", seq_cst) whose result (%unused) is
; never read. The data operand is defined by inline asm with the "=^VA"
; constraint. The checks cover the %atomicrmw.global and %atomicrmw.private
; paths for GFX90A and GFX950.
; NOTE(review): %gep.0 is computed but never used; the atomicrmw addresses
; %ptr directly. Confirm whether %gep.0 was the intended pointer -- the
; autogenerated CHECK lines would need regenerating if the operand changes.
define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_noret_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB62_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB62_4
|
|
; GFX90A-NEXT: .LBB62_2: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX90A-NEXT: .LBB62_3: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB62_2
|
|
; GFX90A-NEXT: .LBB62_4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_noret_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB62_3
|
|
; GFX950-NEXT: ; %bb.1: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB62_4
|
|
; GFX950-NEXT: .LBB62_2: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; GFX950-NEXT: .LBB62_3: ; %atomicrmw.global
|
|
; GFX950-NEXT: buffer_wbl2 sc1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: buffer_inv sc1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB62_2
|
|
; GFX950-NEXT: .LBB62_4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other i32 atomics, covering the a_a and av_av cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; 32-bit atomicrmw add (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=a" constraint and the result is
; consumed by inline asm with the "a" constraint.
define void @flat_atomic_add_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_add_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_add_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw add (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=^VA" constraint and the result
; is consumed by inline asm with the "^VA" constraint.
define void @flat_atomic_add_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_add_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_add_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw sub (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=a" constraint and the result is
; consumed by inline asm with the "a" constraint.
define void @flat_atomic_sub_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_sub_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_sub_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw sub (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=^VA" constraint and the result
; is consumed by inline asm with the "^VA" constraint.
define void @flat_atomic_sub_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_sub_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_sub_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw and (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=a" constraint and the result is
; consumed by inline asm with the "a" constraint.
define void @flat_atomic_and_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_and_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_and_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw and (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=^VA" constraint and the result
; is consumed by inline asm with the "^VA" constraint.
define void @flat_atomic_and_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_and_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_and_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw nand (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=a" constraint and the result is
; consumed by inline asm with the "a" constraint. On both targets the checks
; show a flat_atomic_cmpswap retry loop (%atomicrmw.start / %atomicrmw.end)
; rather than a single atomic instruction.
define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_nand_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v2, v2
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_nand_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; 32-bit atomicrmw nand (syncscope "workgroup", seq_cst) on %gep.0, i.e.
; element 10 of a [512 x i32] array at %ptr (checked as offset:40). The data
; operand is defined by inline asm with the "=^VA" constraint and the result
; is consumed by inline asm with the "^VA" constraint. On both targets the
; checks show a flat_atomic_cmpswap retry loop (%atomicrmw.start /
; %atomicrmw.end) rather than a single atomic instruction.
define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_nand_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v2, v2
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_nand_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB70_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw or` on a flat pointer (element 10 of a [512 x i32],
; the `offset:40` in the checks). Operand and result go through inline asm with
; the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_or, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_or_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw or` on a flat pointer (element 10 of a [512 x i32],
; the `offset:40` in the checks), with operand and result passed through
; inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_or is
; expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_or_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw max` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_smax, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_max_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw max` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_smax
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_max_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw min` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_smin, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_min_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw min` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_smin
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_min_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw umax` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_umax, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_umax_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umax_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umax_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw umax` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_umax
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_umax_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umax_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umax_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw umin` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_umin, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_umin_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umin_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umin_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw umin` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_umin
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_umin_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umin_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umin_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw uinc_wrap` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_inc, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_uinc_wrap_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw uinc_wrap` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_inc
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_uinc_wrap_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw udec_wrap` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Expected code copies a0 into a VGPR (v_accvgpr_read_b32), issues
; flat_atomic_dec, and copies the result back with v_accvgpr_write_b32.
; GFX950 additionally expects an `s_nop 0` between the asm def and the read.
define void @flat_atomic_udec_wrap_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw udec_wrap` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint.
; The checks show the asm values in VGPRs (v2 in, v0 out), so flat_atomic_dec
; is expected to use them directly with no accvgpr copies on either target.
define void @flat_atomic_udec_wrap_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw usub_cond` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop whose new
; value is formed by v_sub / v_cmp_ge / v_cndmask; a0 is read into v4 before
; the loop and the final value is written back to a0 after it.
; GFX950 differs in the order of the s_mov_b64 and v_accvgpr_read_b32 before
; the loop, and expects an `s_nop 1` between v_cmp_ge_u32 and v_cndmask_b32.
define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
|
|
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw usub_cond` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint; the checks show both in VGPRs
; (v4 / v2).
; Both targets are expected to emit a flat_atomic_cmpswap retry loop whose new
; value is formed by v_sub / v_cmp_ge / v_cndmask. GFX950 additionally expects
; an `s_nop 1` between v_cmp_ge_u32 and v_cndmask_b32.
define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
|
|
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw usub_sat` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks). Operand and result go through
; inline asm with the "a" constraint, which the checks show as a0.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop whose new
; value is a clamped v_sub_u32; a0 is read into v4 before the loop and the
; final value is written back to a0 after it.
; GFX950 differs in the order of the s_mov_b64 and v_accvgpr_read_b32 before
; the loop, and expects a `.p2align` directive ahead of the loop header.
define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i32 `atomicrmw usub_sat` on a flat pointer (element 10 of a
; [512 x i32], the `offset:40` in the checks), with operand and result passed
; through inline asm using the "^VA" constraint; the checks show both in VGPRs
; (v4 / v2).
; Both targets are expected to emit a flat_atomic_cmpswap retry loop whose new
; value is a clamped v_sub_u32. GFX950 additionally expects a `.p2align`
; directive ahead of the loop header.
define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics i64, with aa+av cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Returning 64-bit `atomicrmw add` through a flat pointer. The address is
; element 10 of a [512 x i64] array (byte offset 80, the 0x50 in the expected
; address computation); the operation is workgroup-scope seq_cst. Both the data
; operand and the consumed result go through inline asm with the "a"
; constraint. The expected output branches between an %atomicrmw.global path
; (native flat atomic) and an %atomicrmw.private path (load, add, store).
define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB89_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB89_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw add` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. The data
; operand and the consumed result both go through inline asm with the "^VA"
; constraint; the expected output for both targets places them in VGPR pairs.
; The expected output branches between %atomicrmw.global (native flat atomic)
; and %atomicrmw.private (load, add, store).
define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB90_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB90_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB90_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB90_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB90_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw sub` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. Both the
; data operand and the consumed result go through inline asm with the "a"
; constraint. The expected output branches between %atomicrmw.global (native
; flat atomic) and %atomicrmw.private (load, sub with borrow, store).
define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB91_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB91_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw sub` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. The data
; operand and the consumed result both go through inline asm with the "^VA"
; constraint; the expected output for both targets places them in VGPR pairs.
; The expected output branches between %atomicrmw.global (native flat atomic)
; and %atomicrmw.private (load, sub with borrow, store).
define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB92_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB92_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB92_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB92_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB92_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw and` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. Both the
; data operand and the consumed result go through inline asm with the "a"
; constraint. The expected output branches between %atomicrmw.global (native
; flat atomic) and %atomicrmw.private (load, two 32-bit ands, store).
define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB93_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
; GFX90A-NEXT: v_and_b32_e32 v1, v1, v2
; GFX90A-NEXT: v_and_b32_e32 v3, v4, v3
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB93_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw and` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. The data
; operand and the consumed result both go through inline asm with the "^VA"
; constraint; the expected output for both targets places them in VGPR pairs.
; The expected output branches between %atomicrmw.global (native flat atomic)
; and %atomicrmw.private (load, two 32-bit ands, store).
define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB94_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB94_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB94_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3
; GFX90A-NEXT: v_and_b32_e32 v2, v0, v2
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB94_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB94_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB94_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB94_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB94_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw nand` through a flat pointer (element 10 of a
; [512 x i64] array, byte offset 80 = 0x50), workgroup-scope seq_cst. Both the
; data operand and the consumed result go through inline asm with the "a"
; constraint. In the expected output the %atomicrmw.global path is a
; compare-and-swap retry loop (%atomicrmw.start, using flat_atomic_cmpswap_x2)
; rather than a single atomic instruction; the %atomicrmw.private path is a
; plain load, and/not, store sequence.
define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB95_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v7
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v6
; GFX90A-NEXT: v_not_b32_e32 v1, v0
; GFX90A-NEXT: v_not_b32_e32 v0, v8
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB95_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_and_b32_e32 v1, v1, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
; GFX90A-NEXT: v_and_b32_e32 v2, v2, v7
; GFX90A-NEXT: v_not_b32_e32 v1, v1
; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB95_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, v3, v7
; GFX950-NEXT: v_and_b32_e32 v8, v2, v6
; GFX950-NEXT: v_not_b32_e32 v1, v0
; GFX950-NEXT: v_not_b32_e32 v0, v8
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB95_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB95_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v7
; GFX950-NEXT: v_and_b32_e32 v5, v0, v6
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v5
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Test: returning i64 `atomicrmw nand` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "^VA"
;    constraint; in the checked output both end up in VGPRs (def v[4:5],
;    use v[0:1]).
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global is a
;    flat_atomic_cmpswap_x2 retry loop, %atomicrmw.private is a plain
;    load / and + not / store (buffer_* for gfx90a, scratch_* for gfx950).
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_nand_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB96_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX90A-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB96_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB96_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB96_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v4, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v3, v3
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB96_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB96_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB96_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX950-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX950-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB96_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB96_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB96_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
|
|
; GFX950-NEXT: v_not_b32_e32 v3, v2
|
|
; GFX950-NEXT: v_not_b32_e32 v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB96_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=^VA" constraint.
%data = call i64 asm "; def $0", "=^VA"()
|
|
; The operation under test.
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 `atomicrmw or` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst, with AGPR-constrained operand and result.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "a"
;    constraint; in the checked output the operand is defined in a[0:1], copied
;    to VGPRs with v_accvgpr_read_b32, and the result is written back to
;    a[0:1] with v_accvgpr_write_b32 on both paths.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_or_x2, %atomicrmw.private is a plain load / v_or / store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB97_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB97_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB97_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_or_b32_e32 v1, v1, v2
|
|
; GFX90A-NEXT: v_or_b32_e32 v3, v4, v3
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB97_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB97_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB97_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=a" constraint.
%data = call i64 asm "; def $0", "=a"()
|
|
; The operation under test.
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 `atomicrmw or` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "^VA"
;    constraint; in the checked output both end up in VGPRs (def v[2:3],
;    use v[0:1]), so no v_accvgpr copies appear.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_or_x2, %atomicrmw.private is a plain load / v_or / store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB98_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB98_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB98_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3
|
|
; GFX90A-NEXT: v_or_b32_e32 v2, v0, v2
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB98_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB98_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB98_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB98_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB98_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=^VA" constraint.
%data = call i64 asm "; def $0", "=^VA"()
|
|
; The operation under test.
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 signed `atomicrmw max` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst, with AGPR-constrained operand and result.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "a"
;    constraint; in the checked output the operand is defined in a[0:1], copied
;    to VGPRs with v_accvgpr_read_b32, and the result is written back to
;    a[0:1] with v_accvgpr_write_b32 on both paths.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_smax_x2, %atomicrmw.private is a load, a v_cmp_gt_i64 /
;    v_cndmask select, and a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB99_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB99_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB99_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB99_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB99_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB99_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
|
|
; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=a" constraint.
%data = call i64 asm "; def $0", "=a"()
|
|
; The operation under test.
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 signed `atomicrmw max` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "^VA"
;    constraint; in the checked output both end up in VGPRs (def v[2:3],
;    use v[0:1]), so no v_accvgpr copies appear.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_smax_x2, %atomicrmw.private is a load, a v_cmp_gt_i64 /
;    v_cndmask select, and a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB100_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB100_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB100_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB100_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB100_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB100_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB100_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB100_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=^VA" constraint.
%data = call i64 asm "; def $0", "=^VA"()
|
|
; The operation under test.
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 signed `atomicrmw min` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst, with AGPR-constrained operand and result.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "a"
;    constraint; in the checked output the operand is defined in a[0:1], copied
;    to VGPRs with v_accvgpr_read_b32, and the result is written back to
;    a[0:1] with v_accvgpr_write_b32 on both paths.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_smin_x2, %atomicrmw.private is a load, a v_cmp_le_i64 /
;    v_cndmask select, and a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB101_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB101_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB101_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB101_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB101_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB101_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
|
|
; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=a" constraint.
%data = call i64 asm "; def $0", "=a"()
|
|
; The operation under test.
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning i64 signed `atomicrmw min` through a generic (flat) pointer,
; syncscope("workgroup"), seq_cst.
;  - The address is element 10 of a [512 x i64] array (the 0x50 byte offset
;    seen in the checks below).
;  - The operand and the result pass through inline asm with the "^VA"
;    constraint; in the checked output both end up in VGPRs (def v[2:3],
;    use v[0:1]), so no v_accvgpr copies appear.
;  - The checked output has two paths, selected by comparing the high dword of
;    the address with src_private_base: %atomicrmw.global issues
;    flat_atomic_smin_x2, %atomicrmw.private is a load, a v_cmp_le_i64 /
;    v_cndmask select, and a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py (see
; the NOTE at the top of the file); regenerate them instead of hand-editing.
define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB102_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB102_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB102_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB102_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB102_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB102_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB102_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB102_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
; Address of element 10 of the array (byte offset 80 = 0x50 in the checks above).
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
; Inline asm defines the i64 operand under the "=^VA" constraint.
%data = call i64 asm "; def $0", "=^VA"()
|
|
; The operation under test.
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
; Inline asm consumes %result, keeping the atomic's returned value live.
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw umax on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a[0:1], copied through VGPRs around the atomic operation.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_umax_x2 (%atomicrmw.global) or performs a
; load / compare / select / store sequence (%atomicrmw.private).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB103_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB103_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw umax on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; the checks show
; the operand in v[2:3] and the result in v[0:1], with no AGPR copies.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_umax_x2 (%atomicrmw.global) or performs a
; load / compare / select / store sequence (%atomicrmw.private).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB104_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB104_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB104_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB104_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB104_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB104_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB104_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB104_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw umin on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a[0:1], copied through VGPRs around the atomic operation.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_umin_x2 (%atomicrmw.global) or performs a
; load / compare / select / store sequence (%atomicrmw.private).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB105_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB105_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw umin on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; the checks show
; the operand in v[2:3] and the result in v[0:1], with no AGPR copies.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_umin_x2 (%atomicrmw.global) or performs a
; load / compare / select / store sequence (%atomicrmw.private).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB106_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB106_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB106_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB106_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB106_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB106_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB106_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB106_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw uinc_wrap on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a[0:1], copied through VGPRs around the atomic operation.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_inc_x2 (%atomicrmw.global) or, in
; %atomicrmw.private, loads the old value, adds 1, and stores either that
; sum or 0 depending on an unsigned less-than compare against the operand.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB107_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB107_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw uinc_wrap on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; the checks show
; the operand in v[2:3] and the result in v[0:1], with no AGPR copies.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_inc_x2 (%atomicrmw.global) or, in
; %atomicrmw.private, loads the old value, adds 1, and stores either that
; sum or 0 depending on an unsigned less-than compare against the operand.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB108_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB108_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB108_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB108_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB108_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB108_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB108_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB108_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=^VA"()
  %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i64 %result)
  ret void
}
|
|
|
|
; atomicrmw udec_wrap on an i64 through a flat pointer (element 10 of a
; [512 x i64] array, i.e. byte offset 0x50), syncscope("workgroup"), seq_cst.
; The operand is defined by inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a[0:1], copied through VGPRs around the atomic operation.
; Expected code compares the address high half with src_private_base and
; then either issues flat_atomic_dec_x2 (%atomicrmw.global) or, in
; %atomicrmw.private, loads the old value, adds -1, and stores the operand
; instead of that difference when the old value is 0 or is unsigned
; greater than the operand.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB109_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB109_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
  %data = call i64 asm "; def $0", "=a"()
  %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i64 %result)
  ret void
}
|
|
|
|
; Test: 64-bit flat `atomicrmw udec_wrap` (workgroup scope, seq_cst) on element
; 10 of a [512 x i64] array, i.e. byte offset 0x50 from %ptr. The data operand
; is produced and the result consumed by inline asm using the "^VA" constraint;
; the expected output below keeps both in VGPR pairs.
; The checked code compares the high half of the address with src_private_base
; and then runs either the %atomicrmw.global path (flat_atomic_dec_x2) or the
; %atomicrmw.private path (load, compute the wrapping decrement, store).
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB110_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB110_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB110_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB110_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB110_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB110_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB110_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
|
|
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB110_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw usub_cond` (workgroup scope, seq_cst) on element
; 10 of a [512 x i64] array, i.e. byte offset 0x50 from %ptr. The data operand
; is produced and the result consumed by inline asm using the "a" constraint;
; the expected output shows both in a[0:1], moved to and from VGPRs with
; v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The checked code compares the high half of the address with src_private_base.
; The %atomicrmw.global path is a loop around flat_atomic_cmpswap_x2 that
; repeats until the returned value equals the previously loaded one; the
; %atomicrmw.private path is a plain load, conditional subtract and store.
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB111_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB111_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB111_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB111_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB111_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB111_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB111_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB111_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB111_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw usub_cond` (workgroup scope, seq_cst) on element
; 10 of a [512 x i64] array, i.e. byte offset 0x50 from %ptr. The data operand
; is produced and the result consumed by inline asm using the "^VA" constraint;
; the expected output below keeps both in VGPR pairs, so no v_accvgpr copies
; appear.
; The checked code compares the high half of the address with src_private_base.
; The %atomicrmw.global path is a loop around flat_atomic_cmpswap_x2 that
; repeats until the returned value equals the previously loaded one; the
; %atomicrmw.private path is a plain load, conditional subtract and store.
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB112_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB112_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB112_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB112_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB112_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB112_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB112_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB112_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB112_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB112_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB112_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw usub_sat` (workgroup scope, seq_cst) on element
; 10 of a [512 x i64] array, i.e. byte offset 0x50 from %ptr. The data operand
; is produced and the result consumed by inline asm using the "a" constraint;
; the expected output shows both in a[0:1], moved to and from VGPRs with
; v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The checked code compares the high half of the address with src_private_base.
; The %atomicrmw.global path is a loop around flat_atomic_cmpswap_x2 that
; repeats until the returned value equals the previously loaded one; the
; %atomicrmw.private path is a plain load, subtract, select and store. In both
; paths the difference is replaced by 0 when the subtraction's carry-out (vcc)
; is set.
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB113_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB113_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB113_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: .LBB113_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB113_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB113_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB113_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB113_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: .LBB113_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB113_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw usub_sat` (workgroup scope, seq_cst) on element
; 10 of a [512 x i64] array, i.e. byte offset 0x50 from %ptr. The data operand
; is produced and the result consumed by inline asm using the "^VA" constraint;
; the expected output below keeps both in VGPR pairs, so no v_accvgpr copies
; appear.
; The checked code compares the high half of the address with src_private_base.
; The %atomicrmw.global path is a loop around flat_atomic_cmpswap_x2 that
; repeats until the returned value equals the previously loaded one; the
; %atomicrmw.private path is a plain load, subtract, select and store. In both
; paths the difference is replaced by 0 when the subtraction's carry-out (vcc)
; is set.
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB114_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB114_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB114_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB114_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB114_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB114_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB114_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB114_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB114_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics f32, with aa+av cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Test: f32 flat `atomicrmw fadd` (workgroup scope, seq_cst) on element 10 of a
; [512 x float] array, i.e. byte offset 40 from %ptr. The atomicrmw carries
; both !amdgpu.no.fine.grained.memory and !amdgpu.ignore.denormal.mode. The
; data operand is produced and the result consumed by inline asm using the "a"
; constraint; the expected output shows both in a0, moved to and from a VGPR
; with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The two targets are expected to differ. The first set of checks compares the
; high half of the address with src_shared_base and src_private_base and picks
; one of three paths: %atomicrmw.global (global_atomic_add_f32),
; %atomicrmw.private (load, v_add_f32, store) or %atomicrmw.shared
; (ds_add_rtn_f32). The second set expects a single flat_atomic_add_f32 with
; the offset folded into the instruction and no address-space checks.
; The assertions are autogenerated; regenerate them rather than editing by hand.
define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB115_6
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB115_3
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB115_3: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB115_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB115_5: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: .LBB115_6: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB115_8
|
|
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fadd (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is produced by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA".
; The checks with the GFX90A prefix show the operation split into a global
; path (global_atomic_add_f32), a private path (buffer_load_dword, v_add_f32,
; buffer_store_dword) and a shared path (ds_add_rtn_f32); the checks with the
; GFX950 prefix show a single flat_atomic_add_f32.
; NOTE(review): "^VA" presumably permits either a VGPR or an AGPR; the checks
; below show VGPRs being chosen -- confirm against the AMDGPU constraint docs.
define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v3
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB116_6
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB116_3
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr3
|
|
; GFX90A-NEXT: .LBB116_3: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB116_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB116_5: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr3
|
|
; GFX90A-NEXT: .LBB116_6: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB116_8
|
|
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: .LBB116_8: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fsub (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=a" constraint
; and the result is consumed by inline asm with "a"; the checks show both
; living in a0.
; For both check prefixes the expected code is a compare-and-swap loop
; (v_sub_f32 followed by flat_atomic_cmpswap), with a0 copied into a VGPR by
; v_accvgpr_read_b32 before the loop and the result copied back by
; v_accvgpr_write_b32 after it.
define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB117_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .LBB117_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fsub (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA".
; For both check prefixes the expected code is a compare-and-swap loop
; (v_sub_f32 followed by flat_atomic_cmpswap) operating directly on VGPRs
; (def v4 / use v2), with no v_accvgpr copies.
; NOTE(review): "^VA" presumably permits either a VGPR or an AGPR -- confirm
; against the AMDGPU constraint docs.
define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmax (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=a" constraint
; and the result is consumed by inline asm with "a"; the checks show both
; living in a0.
; For both check prefixes the expected code is a compare-and-swap loop:
; the operand is read from a0 with v_accvgpr_read_b32 and passed through
; v_max_f32 with itself once before the loop; each iteration applies
; v_max_f32 to the loaded value with itself, then v_max_f32 against the
; operand, then flat_atomic_cmpswap. The result is written back to a0 with
; v_accvgpr_write_b32 after the loop.
define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX950-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmax (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA".
; For both check prefixes the expected code is a compare-and-swap loop
; (v_max_f32 of the loaded value with itself, v_max_f32 against the operand,
; flat_atomic_cmpswap) operating directly on VGPRs (def v2 / use v2), with
; no v_accvgpr copies.
; NOTE(review): "^VA" presumably permits either a VGPR or an AGPR -- confirm
; against the AMDGPU constraint docs.
define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmin (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=a" constraint
; and the result is consumed by inline asm with "a"; the checks show both
; living in a0.
; For both check prefixes the expected code is a compare-and-swap loop:
; the operand is read from a0 with v_accvgpr_read_b32 and passed through
; v_max_f32 with itself once before the loop; each iteration applies
; v_max_f32 to the loaded value with itself, then v_min_f32 against the
; operand, then flat_atomic_cmpswap. The result is written back to a0 with
; v_accvgpr_write_b32 after the loop.
define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB121_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX950-NEXT: .LBB121_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmin (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA".
; For both check prefixes the expected code is a compare-and-swap loop
; (v_max_f32 of the loaded value with itself, v_min_f32 against the operand,
; flat_atomic_cmpswap) operating directly on VGPRs (def v2 / use v2), with
; no v_accvgpr copies.
; NOTE(review): "^VA" presumably permits either a VGPR or an AGPR -- confirm
; against the AMDGPU constraint docs.
define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB122_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
|
|
; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
|
|
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB122_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmaximum (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=a" constraint
; and the result is consumed by inline asm with "a"; the checks show both
; living in a0, copied to and from VGPRs with v_accvgpr_read_b32 and
; v_accvgpr_write_b32 around a compare-and-swap loop.
; The loop body differs between the two check prefixes: the GFX90A checks
; use v_max_f32 plus v_cmp_o_f32 and a v_cndmask_b32 that selects the
; constant 0x7fc00000 when the ordered compare fails, while the GFX950
; checks use a single v_maximum3_f32 and expect a ".p2align 5, , 4" directive
; before the loop header.
; NOTE(review): 0x7fc00000 is presumably the f32 quiet-NaN bit pattern.
define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: .LBB123_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fmaximum (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA"; the checks
; show VGPRs (def v4 / use v2) and no v_accvgpr copies.
; The compare-and-swap loop body differs between the two check prefixes: the
; GFX90A checks use v_max_f32 plus v_cmp_o_f32 and a v_cndmask_b32 that
; selects the constant 0x7fc00000 when the ordered compare fails, while the
; GFX950 checks use a single v_maximum3_f32 and expect a ".p2align 5, , 4"
; directive before the loop header.
; NOTE(review): 0x7fc00000 is presumably the f32 quiet-NaN bit pattern, and
; "^VA" presumably permits either a VGPR or an AGPR -- confirm both.
define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB124_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB124_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fminimum (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=a" constraint
; and the result is consumed by inline asm with "a"; the checks show both
; living in a0, copied to and from VGPRs with v_accvgpr_read_b32 and
; v_accvgpr_write_b32 around a compare-and-swap loop.
; The loop body differs between the two check prefixes: the GFX90A checks
; use v_min_f32 plus v_cmp_o_f32 and a v_cndmask_b32 that selects the
; constant 0x7fc00000 when the ordered compare fails, while the GFX950
; checks use a single v_minimum3_f32 and expect a ".p2align 5, , 4" directive
; before the loop header.
; NOTE(review): 0x7fc00000 is presumably the f32 quiet-NaN bit pattern.
define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: .LBB125_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f32_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fminimum (f32) through a flat pointer to element 10 of a
; [512 x float] (byte offset 40 in the checks), syncscope("workgroup"),
; seq_cst. The data operand is defined by inline asm with the "=^VA"
; constraint and the result is consumed by inline asm with "^VA"; the checks
; show VGPRs (def v4 / use v2) and no v_accvgpr copies.
; The compare-and-swap loop body differs between the two check prefixes: the
; GFX90A checks use v_min_f32 plus v_cmp_o_f32 and a v_cndmask_b32 that
; selects the constant 0x7fc00000 when the ordered compare fails, while the
; GFX950 checks use a single v_minimum3_f32 and expect a ".p2align 5, , 4"
; directive before the loop header.
; NOTE(review): 0x7fc00000 is presumably the f32 quiet-NaN bit pattern, and
; "^VA" presumably permits either a VGPR or an AGPR -- confirm both.
define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB126_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB126_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics f64, with aa+av cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Test: returning f64 `atomicrmw fadd` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "a" constraint, so the checks show a[0:1] being copied to/from
; VGPRs with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the atomic.
; The expected code compares the address against src_shared_base and
; src_private_base and then uses global_atomic_add_f64, a private
; load + v_add_f64 + store sequence, or ds_add_rtn_f64.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB127_6
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB127_3
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB127_3: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB127_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB127_5: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB127_6: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB127_8
|
|
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB127_6
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB127_3
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB127_3: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB127_5
|
|
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB127_5: ; %Flow1
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB127_6: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB127_8
|
|
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
|
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning f64 `atomicrmw fadd` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "^VA" constraint; in the expected output both end up in VGPR
; pairs (def v[4:5], use v[0:1]) with no accvgpr copies.
; The expected code compares the address against src_shared_base and
; src_private_base and then uses global_atomic_add_f64, a private
; load + v_add_f64 + store sequence, or ds_add_rtn_f64.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB128_6
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB128_3
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB128_3: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB128_5
|
|
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB128_5: ; %Flow1
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB128_6: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB128_8
|
|
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: .LBB128_8: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB128_6
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB128_3
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB128_3: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB128_5
|
|
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB128_5: ; %Flow1
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB128_6: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB128_8
|
|
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
|
|
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning f64 `atomicrmw fsub` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "a" constraint, so the checks show a[0:1] being copied to/from
; VGPRs with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the atomic.
; The expected code compares the address against src_private_base only: the
; %atomicrmw.global path is a flat_atomic_cmpswap_x2 retry loop around
; v_add_f64 with a negated operand, and the %atomicrmw.private path is a
; load + v_add_f64 + store sequence.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB129_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB129_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB129_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: .LBB129_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB129_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB129_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB129_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB129_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: .LBB129_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB129_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning f64 `atomicrmw fsub` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "^VA" constraint; in the expected output both end up in VGPR
; pairs (def v[6:7], use v[0:1]) with no accvgpr copies.
; The expected code compares the address against src_private_base only: the
; %atomicrmw.global path is a flat_atomic_cmpswap_x2 retry loop around
; v_add_f64 with a negated operand, and the %atomicrmw.private path is a
; load + v_add_f64 + store sequence.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[6:7]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB130_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB130_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: .LBB130_4: ; %Flow3
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB130_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[6:7]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB130_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB130_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: .LBB130_4: ; %Flow3
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB130_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
|
|
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
|
|
; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning f64 `atomicrmw fmax` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "a" constraint, so the checks show a[0:1] being copied to/from
; VGPRs with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the atomic.
; The expected code compares the address against src_private_base only: the
; %atomicrmw.global path is a single flat_atomic_max_f64, and the
; %atomicrmw.private path is a load + v_max_f64 + store sequence.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB131_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB131_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB131_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB131_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB131_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB131_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning f64 `atomicrmw fmax` (workgroup scope, seq_cst) on a flat
; pointer to element 10 of a [512 x double] array (the 0x50 byte offset seen in
; the checks). The operand is produced by, and the result consumed by, inline
; asm with the "^VA" constraint; in the expected output both end up in VGPR
; pairs (def v[2:3], use v[0:1]) with no accvgpr copies.
; The expected code compares the address against src_private_base only: the
; %atomicrmw.global path is a single flat_atomic_max_f64, and the
; %atomicrmw.private path is a load + v_max_f64 + store sequence.
; NOTE: the GFX90A/GFX950 lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB132_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB132_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB132_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB132_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB132_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB132_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB132_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB132_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fmin` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "a" constraint; the expected
; output keeps both in a[0:1].
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then runs either the
; %atomicrmw.global path (a native flat_atomic_min_f64) or the
; %atomicrmw.private path (private-memory load, v_min_f64, store); both paths
; copy the old value into a[0:1].
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB133_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB133_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB133_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB133_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB133_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB133_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fmin` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "^VA" constraint; the
; expected output places the data in v[2:3] and the result in v[0:1], with no
; accvgpr copies.
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then runs either the
; %atomicrmw.global path (a native flat_atomic_min_f64) or the
; %atomicrmw.private path (private-memory load, v_min_f64, store).
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB134_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB134_2: ; %Flow
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB134_4
|
|
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB134_4: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB134_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB134_2: ; %Flow
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB134_4
|
|
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
|
|
; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB134_4: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fmaximum` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "a" constraint; the expected
; output keeps both in a[0:1].
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then takes one of two paths:
;   * %atomicrmw.global: a compare-and-swap loop (%atomicrmw.start) that
;     computes v_max_f64, replaces the result with 0x7ff80000:0 when
;     v_cmp_u_f64 reports the operands unordered, and retries
;     flat_atomic_cmpswap_x2 until the returned value equals the expected one.
;   * %atomicrmw.private: private-memory load, the same max/unordered select,
;     then a store.
; Both paths copy the old value into a[0:1].
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB135_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX90A-NEXT: .LBB135_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB135_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB135_4: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB135_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB135_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB135_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB135_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB135_4: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB135_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fmaximum` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "^VA" constraint; the
; expected output places the data in v[4:5] and the result in v[0:1], with no
; accvgpr copies.
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then takes one of two paths:
;   * %atomicrmw.global: a compare-and-swap loop (%atomicrmw.start) that
;     computes v_max_f64, replaces the result with 0x7ff80000:0 when
;     v_cmp_u_f64 reports the operands unordered, and retries
;     flat_atomic_cmpswap_x2 until the returned value equals the expected one.
;   * %atomicrmw.private: private-memory load, the same max/unordered select,
;     then a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB136_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB136_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB136_4: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB136_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB136_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB136_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB136_4: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB136_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fminimum` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "a" constraint; the expected
; output keeps both in a[0:1].
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then takes one of two paths:
;   * %atomicrmw.global: a compare-and-swap loop (%atomicrmw.start) that
;     computes v_min_f64, replaces the result with 0x7ff80000:0 when
;     v_cmp_u_f64 reports the operands unordered, and retries
;     flat_atomic_cmpswap_x2 until the returned value equals the expected one.
;   * %atomicrmw.private: private-memory load, the same min/unordered select,
;     then a store.
; Both paths copy the old value into a[0:1].
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB137_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX90A-NEXT: .LBB137_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB137_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB137_4: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB137_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB137_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB137_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB137_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB137_4: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB137_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 `atomicrmw fminimum` through a flat pointer (workgroup scope,
; seq_cst, tagged !amdgpu.no.fine.grained.memory). The data operand is defined
; and the result consumed by inline asm using the "^VA" constraint; the
; expected output places the data in v[4:5] and the result in v[0:1], with no
; accvgpr copies.
; The address is element 10 of a [512 x double] array (byte offset 0x50 in the
; expected output). The expected code compares the high dword of the address
; with the high dword of src_private_base and then takes one of two paths:
;   * %atomicrmw.global: a compare-and-swap loop (%atomicrmw.start) that
;     computes v_min_f64, replaces the result with 0x7ff80000:0 when
;     v_cmp_u_f64 reports the operands unordered, and retries
;     flat_atomic_cmpswap_x2 until the returned value equals the expected one.
;   * %atomicrmw.private: private-memory load, the same min/unordered select,
;     then a store.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand.
define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
|
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB138_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB138_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX90A-NEXT: .LBB138_4: ; %Flow2
|
|
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB138_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
|
|
; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
|
|
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
|
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB138_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB138_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
|
|
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
|
|
; GFX950-NEXT: .LBB138_4: ; %Flow2
|
|
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execz .LBB138_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
|
|
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
|
|
; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics v2f16, with aa+av cases
|
|
;---------------------------------------------------------------------
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fadd of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; The operand comes from inline asm constrained to "=a" and the result is fed to
; inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 around the atomic.
; Expected lowering: a flat_atomic_cmpswap retry loop under the GFX90A prefix,
; a single flat_atomic_pk_add_f16 (preceded by s_nop 0 before the a0 read)
; under the GFX950 prefix.
define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB139_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB139_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fadd of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; Operand and result go through inline asm constrained with "=^VA" / "^VA"; the
; expected output keeps both in plain VGPRs with no v_accvgpr copies.
; Expected lowering: a flat_atomic_cmpswap retry loop under the GFX90A prefix,
; a single flat_atomic_pk_add_f16 under the GFX950 prefix.
define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB140_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB140_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fsub of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; The operand comes from inline asm constrained to "=a" and the result is fed to
; inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 outside the loop.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop whose
; new value is computed by v_pk_add_f16 with neg_lo/neg_hi set on the second
; source. The GFX950 expectations additionally align the loop header with
; .p2align.
define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: .LBB141_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fsub of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; Operand and result go through inline asm constrained with "=^VA" / "^VA"; the
; expected output keeps both in plain VGPRs with no v_accvgpr copies.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop whose
; new value is computed by v_pk_add_f16 with neg_lo/neg_hi set on the second
; source. The GFX950 expectations additionally align the loop header with
; .p2align.
define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB142_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB142_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmax of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; The operand comes from inline asm constrained to "=a" and the result is fed to
; inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 outside the loop.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop. The
; operand is passed through v_pk_max_f16 x, x once before the loop, the loaded
; value gets the same treatment inside the loop (presumably canonicalization),
; and the two are then combined with v_pk_max_f16. The GFX950 expectations also
; contain an s_nop 0 between the two in-loop v_pk_max_f16 instructions.
define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB143_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmax of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; Operand and result go through inline asm constrained with "=^VA" / "^VA"; the
; expected output keeps both in plain VGPRs with no v_accvgpr copies.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop. The
; operand is passed through v_pk_max_f16 x, x once before the loop, the loaded
; value gets the same treatment inside the loop (presumably canonicalization),
; and the two are then combined with v_pk_max_f16. The GFX950 expectations also
; contain an s_nop 0 between the two in-loop v_pk_max_f16 instructions.
define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB144_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB144_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmin of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; The operand comes from inline asm constrained to "=a" and the result is fed to
; inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 outside the loop.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop. The
; operand is passed through v_pk_max_f16 x, x once before the loop, the loaded
; value gets the same treatment inside the loop (presumably canonicalization),
; and the two are then combined with v_pk_min_f16. The GFX950 expectations also
; contain an s_nop 0 between the in-loop v_pk_max_f16 and v_pk_min_f16.
define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB145_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmin of a <2 x half> through a flat pointer
; (element 10 of the array, which appears as offset:40 in the expected code).
; Operand and result go through inline asm constrained with "=^VA" / "^VA"; the
; expected output keeps both in plain VGPRs with no v_accvgpr copies.
; Expected lowering for both prefixes: a flat_atomic_cmpswap retry loop. The
; operand is passed through v_pk_max_f16 x, x once before the loop, the loaded
; value gets the same treatment inside the loop (presumably canonicalization),
; and the two are then combined with v_pk_min_f16. The GFX950 expectations also
; contain an s_nop 0 between the in-loop v_pk_max_f16 and v_pk_min_f16.
define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB146_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB146_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmaximum of a <2 x half> through a flat
; pointer (element 10 of the array, which appears as offset:40 in the expected
; code). The operand comes from inline asm constrained to "=a" and the result is
; fed to inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 outside the loop.
; Expected lowering for both prefixes is a flat_atomic_cmpswap retry loop.
; Under the GFX90A prefix the new value is built from v_pk_max_f16, a
; v_cmp_o_f16 per half, v_cndmask selecting between that result and the
; constant 0x7e00, and a v_perm_b32 that recombines the halves. Under the
; GFX950 prefix it is a single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
|
|
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
|
|
; GFX90A-NEXT: .LBB147_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fmaximum of a <2 x half> through a flat
; pointer (element 10 of the array, which appears as offset:40 in the expected
; code). Operand and result go through inline asm constrained with "=^VA" /
; "^VA"; the expected output keeps both in plain VGPRs with no v_accvgpr copies.
; Expected lowering for both prefixes is a flat_atomic_cmpswap retry loop.
; Under the GFX90A prefix the new value is built from v_pk_max_f16, a
; v_cmp_o_f16 per half, v_cndmask selecting between that result and the
; constant 0x7e00, and a v_perm_b32 that recombines the halves. Under the
; GFX950 prefix it is a single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
|
|
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB148_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v2
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB148_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Workgroup-scope seq_cst atomicrmw fminimum of a <2 x half> through a flat
; pointer (element 10 of the array, which appears as offset:40 in the expected
; code). The operand comes from inline asm constrained to "=a" and the result is
; fed to inline asm constrained to "a"; the expected output uses a0 and moves it
; through v_accvgpr_read_b32 / v_accvgpr_write_b32 outside the loop.
; Expected lowering for both prefixes is a flat_atomic_cmpswap retry loop.
; Under the GFX90A prefix the new value is built from v_pk_min_f16, a
; v_cmp_o_f16 per half, v_cndmask selecting between that result and the
; constant 0x7e00, and a v_perm_b32 that recombines the halves. Under the
; GFX950 prefix it is a single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
|
|
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
|
|
; GFX90A-NEXT: .LBB149_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4
|
|
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
|
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX950-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x half> asm "; def $0", "=a"()
|
|
%result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x half> %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: atomicrmw fminimum on <2 x half> through a flat pointer, element 10 of
; a [512 x <2 x half>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; in the checks
; below both end up in ordinary v-registers (v4 / v2).
; The GFX90A checks show a flat_atomic_cmpswap retry loop built around
; v_pk_min_f16 plus v_cmp_o_f16 / v_cndmask selects; the GFX950 checks show the
; same loop shape using a single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB150_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB150_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
; Other atomics on <2 x bfloat> (v2bf16), with a_a and av_av cases
;---------------------------------------------------------------------
|
|
|
|
; Test: atomicrmw fadd on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a0, copied with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The GFX90A checks show a flat_atomic_cmpswap retry loop that does the add in
; f32 and repacks the two halves with integer ops and v_perm_b32. The GFX950
; checks show a single flat_atomic_pk_add_bf16, with an "s_nop 0" between the
; inline-asm def of a0 and the v_accvgpr_read_b32 that reads it.
define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB151_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fadd on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; in the checks
; below both end up in ordinary v-registers, with no accvgpr copies.
; The GFX90A checks show a flat_atomic_cmpswap retry loop that does the add in
; f32 and repacks the two halves with integer ops and v_perm_b32. The GFX950
; checks show a single flat_atomic_pk_add_bf16 and no loop.
define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB152_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fsub on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a0, copied with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; Both targets' checks show a flat_atomic_cmpswap retry loop doing the subtract
; in f32. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fsub on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; in the checks
; below both end up in v2, with no accvgpr copies.
; Both targets' checks show a flat_atomic_cmpswap retry loop doing the subtract
; in f32. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB154_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB154_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fmax on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a0, copied with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; Both targets' checks show a flat_atomic_cmpswap retry loop using v_max_f32 on
; each half. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fmax on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; in the checks
; below both end up in v2, with no accvgpr copies.
; Both targets' checks show a flat_atomic_cmpswap retry loop using v_max_f32 on
; each half. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB156_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB156_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fmin on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=a" constraint and the
; result is consumed by inline asm with the "a" constraint; the checks show
; both in a0, copied with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; Both targets' checks show a flat_atomic_cmpswap retry loop using v_min_f32 on
; each half. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Test: atomicrmw fmin on <2 x bfloat> through a flat pointer, element 10 of a
; [512 x <2 x bfloat>] array (the checks show this as "offset:40"), with
; syncscope("workgroup") seq_cst and !amdgpu.no.fine.grained.memory.
; The data operand comes from inline asm with the "=^VA" constraint and the
; result is consumed by inline asm with the "^VA" constraint; in the checks
; below both end up in v2, with no accvgpr copies.
; Both targets' checks show a flat_atomic_cmpswap retry loop using v_min_f32 on
; each half. GFX90A repacks the halves with integer ops and v_perm_b32, while
; GFX950 uses v_cvt_pk_bf16_f32.
define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB158_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB158_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; atomicrmw fmaximum of <2 x bfloat> at element 10 of %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand is defined and the result consumed by
; inline asm with the "a" constraint; the assertions show a0 copied through
; v_accvgpr_read/v_accvgpr_write around a flat_atomic_cmpswap retry loop
; (v_max_f32 plus NaN handling for gfx90a, v_maximum3_f32 for gfx950).
define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; atomicrmw fmaximum of <2 x bfloat> at element 10 of %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand is defined and the result consumed by
; inline asm with the "^VA" constraint; the assertions show the value kept in
; v2 (no accvgpr copies) around a flat_atomic_cmpswap retry loop.
define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB160_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB160_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; atomicrmw fminimum of <2 x bfloat> at element 10 of %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand is defined and the result consumed by
; inline asm with the "a" constraint; the assertions show a0 copied through
; v_accvgpr_read/v_accvgpr_write around a flat_atomic_cmpswap retry loop
; (v_min_f32 plus NaN handling for gfx90a, v_minimum3_f32 for gfx950).
define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; atomicrmw fminimum of <2 x bfloat> at element 10 of %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand is defined and the result consumed by
; inline asm with the "^VA" constraint; the assertions show the value kept in
; v2 (no accvgpr copies) around a flat_atomic_cmpswap retry loop.
define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB162_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB162_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
; other i32 atomics, a_a and av_av cases, with an inreg pointer (saddr)
;---------------------------------------------------------------------
|
|
|
|
; atomicrmw xchg of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "a" inline-asm
; constraint; the assertions show the SGPR pointer pair copied into v[0:1]
; and a0 moved through v_accvgpr_read/v_accvgpr_write around flat_atomic_swap.
define void @flat_atomic_xchg_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw xchg of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "^VA" inline-asm
; constraint; the assertions show the operand in v2 and the result in v0
; feeding flat_atomic_swap directly, with no accvgpr copies.
define void @flat_atomic_xchg_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw add of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "a" inline-asm
; constraint; the assertions show the SGPR pointer pair copied into v[0:1]
; and a0 moved through v_accvgpr_read/v_accvgpr_write around flat_atomic_add.
define void @flat_atomic_add_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw add of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "^VA" inline-asm
; constraint; the assertions show the operand in v2 and the result in v0
; feeding flat_atomic_add directly, with no accvgpr copies.
define void @flat_atomic_add_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw sub of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "a" inline-asm
; constraint; the assertions show the SGPR pointer pair copied into v[0:1]
; and a0 moved through v_accvgpr_read/v_accvgpr_write around flat_atomic_sub.
define void @flat_atomic_sub_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw sub of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "^VA" inline-asm
; constraint; the assertions show the operand in v2 and the result in v0
; feeding flat_atomic_sub directly, with no accvgpr copies.
define void @flat_atomic_sub_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw and of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "a" inline-asm
; constraint; the assertions show the SGPR pointer pair copied into v[0:1]
; and a0 moved through v_accvgpr_read/v_accvgpr_write around flat_atomic_and.
define void @flat_atomic_and_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw and of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "^VA" inline-asm
; constraint; the assertions show the operand in v2 and the result in v0
; feeding flat_atomic_and directly, with no accvgpr copies.
define void @flat_atomic_and_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw nand of i32 at element 10 of an inreg %ptr (byte offset 40),
; workgroup scope, seq_cst. The operand and result use the "a" inline-asm
; constraint; the assertions show a flat_atomic_cmpswap retry loop computing
; the new value with v_and + v_not (gfx90a) or a single v_bitop3_b32 (gfx950),
; with a0 moved through v_accvgpr_read/v_accvgpr_write outside the loop.
define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v1, v4
; GFX90A-NEXT: v_not_b32_e32 v0, v0
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_bitop3_b32 v0, v1, v4, v1 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw nand (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v4 and a compare-and-swap loop: an initial
; flat_load_dword, then flat_atomic_cmpswap at byte offset 40 repeated until
; the returned value equals the expected one. The nand itself is expected as
; v_and_b32 + v_not_b32 under the GFX90A checks and as a single v_bitop3_b32
; under the GFX950 checks.
define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v0, v1, v4
; GFX90A-NEXT: v_not_b32_e32 v0, v0
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB172_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_bitop3_b32 v0, v1, v4, v1 bitop3:0x3f
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB172_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw or (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_or at byte offset 40, and the result copied back into
; a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_or_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw or (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_or at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_or_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw xor (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_xor at byte offset 40, and the result copied back into
; a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_xor_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw xor (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_xor at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_xor_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw max (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_smax at byte offset 40, and the result copied back
; into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_max_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw max (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_smax at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_max_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw min (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_smin at byte offset 40, and the result copied back
; into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_min_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw min (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_smin at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_min_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw umax (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_umax at byte offset 40, and the result copied back
; into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_umax_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw umax (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_umax at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_umax_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw umin (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=a" constraint
; and the result is consumed by inline asm under "a". The checks expect the
; value defined in a0 to be copied out with v_accvgpr_read_b32, a single
; returning flat_atomic_umin at byte offset 40, and the result copied back
; into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect an
; s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_umin_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw umin (workgroup scope, seq_cst) on element 10 of an inreg flat
; pointer. The data operand comes from inline asm under the "=^VA" constraint
; and the result is consumed by inline asm under "^VA". The checks expect the
; operand to be defined directly in v2 and the result used directly from v0,
; so a single returning flat_atomic_umin at byte offset 40 appears with no
; accvgpr copies around it.
define void @flat_atomic_umin_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_umin_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw uinc_wrap (workgroup scope, seq_cst) on element 10 of an inreg
; flat pointer. The data operand comes from inline asm under the "=a"
; constraint and the result is consumed by inline asm under "a". The checks
; expect the value defined in a0 to be copied out with v_accvgpr_read_b32, a
; single returning flat_atomic_inc at byte offset 40, and the result copied
; back into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect
; an s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_uinc_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw uinc_wrap (workgroup scope, seq_cst) on element 10 of an inreg
; flat pointer. The data operand comes from inline asm under the "=^VA"
; constraint and the result is consumed by inline asm under "^VA". The checks
; expect the operand to be defined directly in v2 and the result used directly
; from v0, so a single returning flat_atomic_inc at byte offset 40 appears
; with no accvgpr copies around it.
define void @flat_atomic_uinc_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw udec_wrap (workgroup scope, seq_cst) on element 10 of an inreg
; flat pointer. The data operand comes from inline asm under the "=a"
; constraint and the result is consumed by inline asm under "a". The checks
; expect the value defined in a0 to be copied out with v_accvgpr_read_b32, a
; single returning flat_atomic_dec at byte offset 40, and the result copied
; back into a0 with v_accvgpr_write_b32. The GFX950 checks additionally expect
; an s_nop 0 between the inline asm and the v_accvgpr_read_b32.
define void @flat_atomic_udec_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw udec_wrap (workgroup scope, seq_cst) on element 10 of an inreg
; flat pointer. The data operand comes from inline asm under the "=^VA"
; constraint and the result is consumed by inline asm under "^VA". The checks
; expect the operand to be defined directly in v2 and the result used directly
; from v0, so a single returning flat_atomic_dec at byte offset 40 appears
; with no accvgpr copies around it.
define void @flat_atomic_udec_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s16
; GFX90A-NEXT: v_mov_b32_e32 v1, s17
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=^VA"()
  %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(i32 %result)
  ret void
}
|
|
|
|
; atomicrmw usub_cond (workgroup scope, seq_cst) on element 10 of an inreg
; flat pointer. The data operand comes from inline asm under the "=a"
; constraint and the result is consumed by inline asm under "a". The checks
; expect the value defined in a0 to be copied out with v_accvgpr_read_b32 and
; a compare-and-swap loop: an initial flat_load_dword, then v_sub_u32,
; v_cmp_ge_u32 and v_cndmask_b32 to form the new value, and
; flat_atomic_cmpswap at byte offset 40 repeated until the returned value
; equals the expected one. The result is copied back into a0 with
; v_accvgpr_write_b32. The GFX950 checks additionally expect an s_nop 1
; between the v_cmp_ge_u32 and the v_cndmask_b32.
define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v4
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_sub_u32_e32 v0, v1, v4
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
  %data = call i32 asm "; def $0", "=a"()
  %result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(i32 %result)
  ret void
}
|
|
|
|
define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB190_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e32 v0, v1, v4
|
|
; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB190_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB192_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB192_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
|
|
%data = call i32 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i32 %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics i64, with aa+av cases using saddr
|
|
;---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB193_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB193_3
|
|
; GFX90A-NEXT: s_branch .LBB193_4
|
|
; GFX90A-NEXT: .LBB193_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3
|
|
; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX90A-NEXT: buffer_load_dword a2, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword a3, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_nop 0
|
|
; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(2)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB193_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
|
|
; GFX950-NEXT: s_cbranch_execz .LBB193_3
|
|
; GFX950-NEXT: s_branch .LBB193_4
|
|
; GFX950-NEXT: .LBB193_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
|
|
; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0
|
|
; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB194_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB194_3
|
|
; GFX90A-NEXT: s_branch .LBB194_4
|
|
; GFX90A-NEXT: .LBB194_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB194_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_nop 0
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB194_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB194_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB194_3
|
|
; GFX950-NEXT: s_branch .LBB194_4
|
|
; GFX950-NEXT: .LBB194_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB194_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB194_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB195_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB195_3
|
|
; GFX90A-NEXT: s_branch .LBB195_4
|
|
; GFX90A-NEXT: .LBB195_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB195_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB195_3
|
|
; GFX950-NEXT: s_branch .LBB195_4
|
|
; GFX950-NEXT: .LBB195_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB196_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB196_3
|
|
; GFX90A-NEXT: s_branch .LBB196_4
|
|
; GFX90A-NEXT: .LBB196_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB196_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB196_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB196_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB196_3
|
|
; GFX950-NEXT: s_branch .LBB196_4
|
|
; GFX950-NEXT: .LBB196_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB196_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB196_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB197_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB197_3
|
|
; GFX90A-NEXT: s_branch .LBB197_4
|
|
; GFX90A-NEXT: .LBB197_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB197_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB197_3
|
|
; GFX950-NEXT: s_branch .LBB197_4
|
|
; GFX950-NEXT: .LBB197_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB198_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB198_3
|
|
; GFX90A-NEXT: s_branch .LBB198_4
|
|
; GFX90A-NEXT: .LBB198_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB198_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB198_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB198_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB198_3
|
|
; GFX950-NEXT: s_branch .LBB198_4
|
|
; GFX950-NEXT: .LBB198_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB198_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB198_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'and' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg; 10 * 8 bytes is the
; 0x50 added to the address in the expected output below.
; The operand comes from inline asm with an "=a" constraint and the returned
; value is consumed by inline asm with an "a" constraint; the expected output
; keeps both in a[0:1].
; Under both the GFX90A and GFX950 prefixes the expected code compares the high
; half of the address with src_private_base, then either issues
; flat_atomic_and_x2 (%atomicrmw.global) or runs a plain load / v_and / store
; sequence (%atomicrmw.private).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB199_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB199_3
|
|
; GFX90A-NEXT: s_branch .LBB199_4
|
|
; GFX90A-NEXT: .LBB199_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v0
|
|
; GFX90A-NEXT: v_and_b32_e32 v1, v4, v1
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB199_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB199_3
|
|
; GFX950-NEXT: s_branch .LBB199_4
|
|
; GFX950-NEXT: .LBB199_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
|
|
; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'and' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg (byte offset 0x50 in the
; expected output below).
; The operand and the returned value go through inline asm with the "=^VA" and
; "^VA" constraints; the expected output allocates v-register pairs for both.
; NOTE(review) - presumably this constraint admits either VGPRs or AGPRs
; (hence "av" in the function name); confirm against the AMDGPU constraint list.
; Under both the GFX90A and GFX950 prefixes the expected code compares the high
; half of the address with src_private_base, then either issues
; flat_atomic_and_x2 (%atomicrmw.global) or runs a plain load / v_and / store
; sequence (%atomicrmw.private).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB200_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB200_3
|
|
; GFX90A-NEXT: s_branch .LBB200_4
|
|
; GFX90A-NEXT: .LBB200_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB200_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB200_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB200_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB200_3
|
|
; GFX950-NEXT: s_branch .LBB200_4
|
|
; GFX950-NEXT: .LBB200_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB200_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB200_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'nand' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg (byte offset 0x50 in the
; expected output below).
; The operand comes from inline asm with an "=a" constraint and the returned
; value is consumed by inline asm with an "a" constraint; the expected output
; keeps both in a[0:1].
; Unlike the and/or/xor tests in this file, the expected global path is a loop
; (%atomicrmw.start) that computes v_and + v_not and retries
; flat_atomic_cmpswap_x2 until the compare succeeds; the private path
; (%atomicrmw.private) is a plain load / v_and / v_not / store sequence.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB201_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB201_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX90A-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB201_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: s_branch .LBB201_6
|
|
; GFX90A-NEXT: .LBB201_4:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB201_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v1, v1, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
|
|
; GFX90A-NEXT: v_and_b32_e32 v2, v2, v5
|
|
; GFX90A-NEXT: v_not_b32_e32 v1, v1
|
|
; GFX90A-NEXT: v_not_b32_e32 v2, v2
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB201_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB201_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX950-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX950-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB201_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_branch .LBB201_6
|
|
; GFX950-NEXT: .LBB201_4:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB201_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_not_b32_e32 v3, v2
|
|
; GFX950-NEXT: v_not_b32_e32 v2, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'nand' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg (byte offset 0x50 in the
; expected output below).
; The operand and the returned value go through inline asm with the "=^VA" and
; "^VA" constraints; the expected output allocates v-register pairs for both.
; NOTE(review) - presumably this constraint admits either VGPRs or AGPRs
; (hence "av" in the function name); confirm against the AMDGPU constraint list.
; The expected global path is a loop (%atomicrmw.start) that computes
; v_and + v_not and retries flat_atomic_cmpswap_x2 until the compare succeeds;
; the private path (%atomicrmw.private) is a plain load / v_and / v_not / store
; sequence.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB202_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX90A-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB202_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_branch .LBB202_6
|
|
; GFX90A-NEXT: .LBB202_4:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB202_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v4, v4
|
|
; GFX90A-NEXT: v_not_b32_e32 v3, v3
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB202_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB202_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB202_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
|
|
; GFX950-NEXT: v_not_b32_e32 v1, v0
|
|
; GFX950-NEXT: v_not_b32_e32 v0, v8
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB202_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_branch .LBB202_6
|
|
; GFX950-NEXT: .LBB202_4:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB202_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
|
|
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
|
|
; GFX950-NEXT: v_not_b32_e32 v3, v2
|
|
; GFX950-NEXT: v_not_b32_e32 v2, v4
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB202_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'or' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg; 10 * 8 bytes is the
; 0x50 added to the address in the expected output below.
; The operand comes from inline asm with an "=a" constraint and the returned
; value is consumed by inline asm with an "a" constraint; the expected output
; keeps both in a[0:1].
; Under both the GFX90A and GFX950 prefixes the expected code compares the high
; half of the address with src_private_base, then either issues
; flat_atomic_or_x2 (%atomicrmw.global) or runs a plain load / v_or / store
; sequence (%atomicrmw.private).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB203_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB203_3
|
|
; GFX90A-NEXT: s_branch .LBB203_4
|
|
; GFX90A-NEXT: .LBB203_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v1, v4, v1
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB203_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB203_3
|
|
; GFX950-NEXT: s_branch .LBB203_4
|
|
; GFX950-NEXT: .LBB203_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_or_b32_e32 v1, v3, v1
|
|
; GFX950-NEXT: v_or_b32_e32 v0, v2, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'or' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg (byte offset 0x50 in the
; expected output below).
; The operand and the returned value go through inline asm with the "=^VA" and
; "^VA" constraints; the expected output allocates v-register pairs for both.
; NOTE(review) - presumably this constraint admits either VGPRs or AGPRs
; (hence "av" in the function name); confirm against the AMDGPU constraint list.
; Under both the GFX90A and GFX950 prefixes the expected code compares the high
; half of the address with src_private_base, then either issues
; flat_atomic_or_x2 (%atomicrmw.global) or runs a plain load / v_or / store
; sequence (%atomicrmw.private).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB204_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB204_3
|
|
; GFX90A-NEXT: s_branch .LBB204_4
|
|
; GFX90A-NEXT: .LBB204_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB204_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1
|
|
; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB204_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB204_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB204_3
|
|
; GFX950-NEXT: s_branch .LBB204_4
|
|
; GFX950-NEXT: .LBB204_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB204_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB204_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Flat 64-bit atomicrmw 'xor' (workgroup scope, seq_cst) on element 10 of a
; [512 x i64] array whose base pointer is passed inreg; 10 * 8 bytes is the
; 0x50 added to the address in the expected output below.
; The operand comes from inline asm with an "=a" constraint and the returned
; value is consumed by inline asm with an "a" constraint; the expected output
; keeps both in a[0:1].
; Under both the GFX90A and GFX950 prefixes the expected code compares the high
; half of the address with src_private_base, then either issues
; flat_atomic_xor_x2 (%atomicrmw.global) or runs a plain load / v_xor / store
; sequence (%atomicrmw.private).
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB205_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB205_3
|
|
; GFX90A-NEXT: s_branch .LBB205_4
|
|
; GFX90A-NEXT: .LBB205_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
|
|
; GFX90A-NEXT: v_xor_b32_e32 v0, v3, v0
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v4, v1
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB205_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB205_3
|
|
; GFX950-NEXT: s_branch .LBB205_4
|
|
; GFX950-NEXT: .LBB205_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1
|
|
; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw xor` (workgroup scope, seq_cst) on an inreg pointer
; offset to element 10 of a [512 x i64] array (the 0x50 added in the checks).
; The operand is produced by, and the result is consumed by, inline asm using
; the "^VA" constraint; the expected output shows VGPR pairs being used.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_xor_x2
; (%atomicrmw.global) or does a load / v_xor / store sequence
; (%atomicrmw.private).
define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB206_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB206_3
|
|
; GFX90A-NEXT: s_branch .LBB206_4
|
|
; GFX90A-NEXT: .LBB206_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB206_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1
|
|
; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB206_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB206_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB206_3
|
|
; GFX950-NEXT: s_branch .LBB206_4
|
|
; GFX950-NEXT: .LBB206_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB206_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
|
|
; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB206_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw max` (signed; workgroup scope, seq_cst) on an inreg
; pointer offset to element 10 of a [512 x i64] array (the 0x50 added in the
; checks). The operand is defined by inline asm with the "=a" constraint and
; the result is consumed with "a"; the expected output keeps both in a[0:1]
; and moves them through VGPRs with v_accvgpr_read / v_accvgpr_write.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_smax_x2
; (%atomicrmw.global) or does a load / v_cmp_gt_i64 + v_cndmask / store
; sequence (%atomicrmw.private).
define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB207_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB207_3
|
|
; GFX90A-NEXT: s_branch .LBB207_4
|
|
; GFX90A-NEXT: .LBB207_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB207_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB207_3
|
|
; GFX950-NEXT: s_branch .LBB207_4
|
|
; GFX950-NEXT: .LBB207_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw max` (signed; workgroup scope, seq_cst) on an inreg
; pointer offset to element 10 of a [512 x i64] array (the 0x50 added in the
; checks). The operand is produced by, and the result is consumed by, inline
; asm using the "^VA" constraint; the expected output shows VGPR pairs being
; used.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_smax_x2
; (%atomicrmw.global) or does a load / v_cmp_gt_i64 + v_cndmask / store
; sequence (%atomicrmw.private).
; NOTE(review): the GFX950 expectations contain an `s_nop 1` between the
; compare and the first v_cndmask; presumably a hazard stall -- confirm.
define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB208_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB208_3
|
|
; GFX90A-NEXT: s_branch .LBB208_4
|
|
; GFX90A-NEXT: .LBB208_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB208_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB208_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB208_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB208_3
|
|
; GFX950-NEXT: s_branch .LBB208_4
|
|
; GFX950-NEXT: .LBB208_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB208_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB208_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw min` (signed; workgroup scope, seq_cst) on an inreg
; pointer offset to element 10 of a [512 x i64] array (the 0x50 added in the
; checks). The operand is defined by inline asm with the "=a" constraint and
; the result is consumed with "a"; the expected output keeps both in a[0:1]
; and moves them through VGPRs with v_accvgpr_read / v_accvgpr_write.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_smin_x2
; (%atomicrmw.global) or does a load / v_cmp_le_i64 + v_cndmask / store
; sequence (%atomicrmw.private).
define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB209_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB209_3
|
|
; GFX90A-NEXT: s_branch .LBB209_4
|
|
; GFX90A-NEXT: .LBB209_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB209_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB209_3
|
|
; GFX950-NEXT: s_branch .LBB209_4
|
|
; GFX950-NEXT: .LBB209_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw min` (signed; workgroup scope, seq_cst) on an inreg
; pointer offset to element 10 of a [512 x i64] array (the 0x50 added in the
; checks). The operand is produced by, and the result is consumed by, inline
; asm using the "^VA" constraint; the expected output shows VGPR pairs being
; used.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_smin_x2
; (%atomicrmw.global) or does a load / v_cmp_le_i64 + v_cndmask / store
; sequence (%atomicrmw.private).
; NOTE(review): the GFX950 expectations contain an `s_nop 1` between the
; compare and the first v_cndmask; presumably a hazard stall -- confirm.
define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB210_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB210_3
|
|
; GFX90A-NEXT: s_branch .LBB210_4
|
|
; GFX90A-NEXT: .LBB210_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB210_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB210_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB210_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB210_3
|
|
; GFX950-NEXT: s_branch .LBB210_4
|
|
; GFX950-NEXT: .LBB210_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB210_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB210_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw umax` (unsigned; workgroup scope, seq_cst) on an
; inreg pointer offset to element 10 of a [512 x i64] array (the 0x50 added
; in the checks). The operand is defined by inline asm with the "=a"
; constraint and the result is consumed with "a"; the expected output keeps
; both in a[0:1] and moves them through VGPRs with v_accvgpr_read /
; v_accvgpr_write.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_umax_x2
; (%atomicrmw.global) or does a load / v_cmp_gt_u64 + v_cndmask / store
; sequence (%atomicrmw.private).
define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB211_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB211_3
|
|
; GFX90A-NEXT: s_branch .LBB211_4
|
|
; GFX90A-NEXT: .LBB211_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB211_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB211_3
|
|
; GFX950-NEXT: s_branch .LBB211_4
|
|
; GFX950-NEXT: .LBB211_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw umax` (unsigned; workgroup scope, seq_cst) on an
; inreg pointer offset to element 10 of a [512 x i64] array (the 0x50 added
; in the checks). The operand is produced by, and the result is consumed by,
; inline asm using the "^VA" constraint; the expected output shows VGPR pairs
; being used.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_umax_x2
; (%atomicrmw.global) or does a load / v_cmp_gt_u64 + v_cndmask / store
; sequence (%atomicrmw.private).
; NOTE(review): the GFX950 expectations contain an `s_nop 1` between the
; compare and the first v_cndmask; presumably a hazard stall -- confirm.
define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB212_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB212_3
|
|
; GFX90A-NEXT: s_branch .LBB212_4
|
|
; GFX90A-NEXT: .LBB212_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB212_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB212_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB212_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB212_3
|
|
; GFX950-NEXT: s_branch .LBB212_4
|
|
; GFX950-NEXT: .LBB212_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB212_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB212_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit flat `atomicrmw umin` (unsigned; workgroup scope, seq_cst) on an
; inreg pointer offset to element 10 of a [512 x i64] array (the 0x50 added
; in the checks). The operand is defined by inline asm with the "=a"
; constraint and the result is consumed with "a"; the expected output keeps
; both in a[0:1] and moves them through VGPRs with v_accvgpr_read /
; v_accvgpr_write.
; The expected code compares the high half of the address with
; src_private_base and then either issues flat_atomic_umin_x2
; (%atomicrmw.global) or does a load / v_cmp_le_u64 + v_cndmask / store
; sequence (%atomicrmw.private).
define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB213_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB213_3
|
|
; GFX90A-NEXT: s_branch .LBB213_4
|
|
; GFX90A-NEXT: .LBB213_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB213_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB213_3
|
|
; GFX950-NEXT: s_branch .LBB213_4
|
|
; GFX950-NEXT: .LBB213_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw umin` through a flat pointer passed `inreg` (the checked
; output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "^VA" constraint;
; in the checked output both end up in VGPR pairs.
; Checked code shape: the high dword of the address is compared against
; src_private_base; the non-private case issues flat_atomic_umin_x2
; (%atomicrmw.global), the private case does load / compare / select / store
; (%atomicrmw.private).
define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB214_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB214_3
|
|
; GFX90A-NEXT: s_branch .LBB214_4
|
|
; GFX90A-NEXT: .LBB214_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB214_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB214_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB214_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB214_3
|
|
; GFX950-NEXT: s_branch .LBB214_4
|
|
; GFX950-NEXT: .LBB214_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB214_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB214_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=^VA" output constraint.
  %data = call i64 asm "; def $0", "=^VA"()
|
|
  ; Operation under test: workgroup-scope, seq_cst unsigned-min.
  %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "^VA" input constraint.
  call void asm "; use $0", "^VA"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw uinc_wrap` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "a" constraint;
; the checked output shows them in a[0:1], copied to VGPRs with
; v_accvgpr_read_b32 before the atomic and written back with
; v_accvgpr_write_b32 afterwards.
; Checked code shape: the high dword of the address is compared against
; src_private_base; the non-private case issues flat_atomic_inc_x2
; (%atomicrmw.global), the private case does load / add 1 / compare / select /
; store (%atomicrmw.private).
define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB215_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB215_3
|
|
; GFX90A-NEXT: s_branch .LBB215_4
|
|
; GFX90A-NEXT: .LBB215_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
|
|
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB215_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB215_3
|
|
; GFX950-NEXT: s_branch .LBB215_4
|
|
; GFX950-NEXT: .LBB215_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1
|
|
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
|
; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=a" output constraint.
  %data = call i64 asm "; def $0", "=a"()
|
|
  ; Operation under test: workgroup-scope, seq_cst wrapping unsigned increment.
  %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "a" input constraint.
  call void asm "; use $0", "a"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw uinc_wrap` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "^VA" constraint;
; in the checked output both end up in VGPR pairs, so no accvgpr copies appear.
; Checked code shape: the high dword of the address is compared against
; src_private_base; the non-private case issues flat_atomic_inc_x2
; (%atomicrmw.global), the private case does load / add 1 / compare / select /
; store (%atomicrmw.private).
define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB216_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB216_3
|
|
; GFX90A-NEXT: s_branch .LBB216_4
|
|
; GFX90A-NEXT: .LBB216_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: .LBB216_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB216_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB216_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB216_3
|
|
; GFX950-NEXT: s_branch .LBB216_4
|
|
; GFX950-NEXT: .LBB216_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB216_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
|
|
; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB216_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=^VA" output constraint.
  %data = call i64 asm "; def $0", "=^VA"()
|
|
  ; Operation under test: workgroup-scope, seq_cst wrapping unsigned increment.
  %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "^VA" input constraint.
  call void asm "; use $0", "^VA"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw udec_wrap` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "a" constraint;
; the checked output shows them in a[0:1], copied to VGPRs with
; v_accvgpr_read_b32 before the atomic and written back with
; v_accvgpr_write_b32 afterwards.
; Checked code shape: the high dword of the address is compared against
; src_private_base; the non-private case issues flat_atomic_dec_x2
; (%atomicrmw.global), the private case loads the value, adds -1, combines an
; equal-to-zero compare with a greater-than-operand compare, selects, and
; stores (%atomicrmw.private).
define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB217_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB217_3
|
|
; GFX90A-NEXT: s_branch .LBB217_4
|
|
; GFX90A-NEXT: .LBB217_2:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
|
|
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB217_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB217_3
|
|
; GFX950-NEXT: s_branch .LBB217_4
|
|
; GFX950-NEXT: .LBB217_2:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
|
|
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
|
|
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2
|
|
; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=a" output constraint.
  %data = call i64 asm "; def $0", "=a"()
|
|
  ; Operation under test: workgroup-scope, seq_cst wrapping unsigned decrement.
  %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "a" input constraint.
  call void asm "; use $0", "a"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw udec_wrap` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "^VA" constraint;
; in the checked output both end up in VGPR pairs, so no accvgpr copies appear.
; Checked code shape: the high dword of the address is compared against
; src_private_base; the non-private case issues flat_atomic_dec_x2
; (%atomicrmw.global), the private case loads the value, adds -1, combines an
; equal-to-zero compare with a greater-than-operand compare, selects, and
; stores (%atomicrmw.private).
define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB218_2
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB218_3
|
|
; GFX90A-NEXT: s_branch .LBB218_4
|
|
; GFX90A-NEXT: .LBB218_2:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
|
|
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
|
|
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
|
|
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB218_2
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: s_cbranch_execz .LBB218_3
|
|
; GFX950-NEXT: s_branch .LBB218_4
|
|
; GFX950-NEXT: .LBB218_2:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: .LBB218_3: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
|
; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
|
|
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
|
|
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2
|
|
; GFX950-NEXT: .LBB218_4: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=^VA" output constraint.
  %data = call i64 asm "; def $0", "=^VA"()
|
|
  ; Operation under test: workgroup-scope, seq_cst wrapping unsigned decrement.
  %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "^VA" input constraint.
  call void asm "; use $0", "^VA"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw usub_cond` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "a" constraint;
; the checked output shows them in a[0:1], copied to VGPRs with
; v_accvgpr_read_b32 up front and written back with v_accvgpr_write_b32.
; Checked code shape: the high dword of the address is compared against
; src_private_base. Unlike the umin/inc/dec tests in this file, the non-private
; case has no single flat atomic: it is a flat_load_dwordx2 followed by a
; flat_atomic_cmpswap_x2 loop (%atomicrmw.start) that computes subtract /
; compare-ge / select each iteration. The private case does load / subtract /
; compare-ge / select / store (%atomicrmw.private).
define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB219_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB219_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB219_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: s_branch .LBB219_6
|
|
; GFX90A-NEXT: .LBB219_4:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB219_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB219_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB219_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB219_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_branch .LBB219_6
|
|
; GFX950-NEXT: .LBB219_4:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB219_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=a" output constraint.
  %data = call i64 asm "; def $0", "=a"()
|
|
  ; Operation under test: workgroup-scope, seq_cst conditional unsigned subtract.
  %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "a" input constraint.
  call void asm "; use $0", "a"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; 64-bit `atomicrmw usub_cond` through a flat pointer passed `inreg` (the
; checked output reads it from SGPRs), with the returned value used afterwards.
; The operand and the result go through inline asm with the "^VA" constraint;
; in the checked output both end up in VGPR pairs, so no accvgpr copies appear.
; Checked code shape: the high dword of the address is compared against
; src_private_base. The non-private case is a flat_load_dwordx2 followed by a
; flat_atomic_cmpswap_x2 loop (%atomicrmw.start) that computes subtract /
; compare-ge / select each iteration. The private case does load / subtract /
; compare-ge / select / store (%atomicrmw.private).
define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB220_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB220_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_branch .LBB220_6
|
|
; GFX90A-NEXT: .LBB220_4:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB220_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB220_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB220_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB220_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB220_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_branch .LBB220_6
|
|
; GFX950-NEXT: .LBB220_4:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB220_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB220_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
  ; Element 10 of the i64 array: byte offset 80, the 0x50 added to the base in
  ; the checked output.
  %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
  ; Operand comes from inline asm with the "=^VA" output constraint.
  %data = call i64 asm "; def $0", "=^VA"()
|
|
  ; Operation under test: workgroup-scope, seq_cst conditional unsigned subtract.
  %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
  ; Feed the atomicrmw result to inline asm with the "^VA" input constraint.
  call void asm "; use $0", "^VA"(i64 %result)
|
|
  ret void
|
|
}
|
|
|
|
; Returning i64 atomicrmw usub_sat (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x i64] array, which is
; the 0x50 byte offset seen in the checks. The data operand is defined by
; inline asm with the "=a" constraint and the result is consumed with "a"; the
; checks show both in a[0:1]. In the checks the global path is a
; flat_atomic_cmpswap_x2 retry loop, while the private path is a plain
; load / subtract / select / store sequence (buffer_* in the GFX90A checks,
; scratch_* in the GFX950 checks).
define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB221_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB221_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB221_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: s_branch .LBB221_6
|
|
; GFX90A-NEXT: .LBB221_4:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB221_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB221_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB221_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB221_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_branch .LBB221_6
|
|
; GFX950-NEXT: .LBB221_4:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB221_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=a"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning i64 atomicrmw usub_sat (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x i64] array, which is
; the 0x50 byte offset seen in the checks. The data operand is defined by
; inline asm with the "=^VA" constraint and the result is consumed with "^VA"
; (the av_av case in the function name); the checks show VGPRs selected for
; both (def v[4:5], use v[0:1]) and no v_accvgpr copies. The global path is a
; flat_atomic_cmpswap_x2 retry loop and the private path is a plain
; load / subtract / select / store sequence.
define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[4:5]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB222_4
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB222_2
|
|
; GFX90A-NEXT: ; %bb.3: ; %Flow
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_branch .LBB222_6
|
|
; GFX90A-NEXT: .LBB222_4:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB222_6
|
|
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(1)
|
|
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[4:5]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB222_4
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
|
|
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: .LBB222_2: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
|
|
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB222_2
|
|
; GFX950-NEXT: ; %bb.3: ; %Flow
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_branch .LBB222_6
|
|
; GFX950-NEXT: .LBB222_4:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
|
|
; GFX950-NEXT: s_cbranch_execz .LBB222_6
|
|
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
|
|
; GFX950-NEXT: s_nop 1
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
|
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
|
|
; GFX950-NEXT: .LBB222_6: ; %atomicrmw.phi
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
|
|
%data = call i64 asm "; def $0", "=^VA"()
|
|
%result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(i64 %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics f32, with aa+av cases using saddr
|
|
;---------------------------------------------------------------------
|
|
|
|
; Returning float atomicrmw fadd (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=a" constraint and the result is consumed with "a"; the checks show a0 for
; both. The atomicrmw carries !amdgpu.no.fine.grained.memory and
; !amdgpu.ignore.denormal.mode. The GFX90A checks branch between shared
; (ds_add_rtn_f32), private (buffer load / v_add_f32 / buffer store) and
; global (global_atomic_add_f32) paths, while the GFX950 checks contain a
; single flat_atomic_add_f32.
define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 40
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB223_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB223_4
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB223_5
|
|
; GFX90A-NEXT: s_branch .LBB223_6
|
|
; GFX90A-NEXT: .LBB223_3:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0
|
|
; GFX90A-NEXT: s_branch .LBB223_7
|
|
; GFX90A-NEXT: .LBB223_4:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0
|
|
; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
|
|
; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB223_6: ; %Flow1
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB223_8
|
|
; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
|
; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_nop 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
|
|
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning float atomicrmw fadd (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=^VA" constraint and the result is consumed with "^VA" (the av_av case in
; the function name); the checks show VGPRs selected for both and no
; v_accvgpr copies. The GFX90A checks branch between shared (ds_add_rtn_f32),
; private (buffer load / v_add_f32 / buffer store) and global
; (global_atomic_add_f32) paths, while the GFX950 checks contain a single
; flat_atomic_add_f32.
define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 40
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB224_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB224_4
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB224_5
|
|
; GFX90A-NEXT: s_branch .LBB224_6
|
|
; GFX90A-NEXT: .LBB224_3:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr1
|
|
; GFX90A-NEXT: s_branch .LBB224_7
|
|
; GFX90A-NEXT: .LBB224_4:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr1
|
|
; GFX90A-NEXT: .LBB224_5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s6
|
|
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0
|
|
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: .LBB224_6: ; %Flow1
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB224_8
|
|
; GFX90A-NEXT: .LBB224_7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
|
; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: .LBB224_8: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v1
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v2
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning float atomicrmw fsub (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=a" constraint and the result is consumed with "a"; the checks show a0 for
; both, copied through VGPRs with v_accvgpr_read / v_accvgpr_write. For both
; targets the checks show a flat_atomic_cmpswap retry loop around v_sub_f32.
define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning float atomicrmw fsub (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=^VA" constraint and the result is consumed with "^VA" (the av_av case in
; the function name); the checks show VGPRs selected for both (def v4, use v0)
; and no v_accvgpr copies. For both targets the checks show a
; flat_atomic_cmpswap retry loop around v_sub_f32.
define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB226_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB226_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning float atomicrmw fmax (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=a" constraint and the result is consumed with "a"; the checks show a0 for
; both, copied through VGPRs with v_accvgpr_read / v_accvgpr_write. For both
; targets the checks show a flat_atomic_cmpswap retry loop around v_max_f32.
define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning float atomicrmw fmax (workgroup syncscope, seq_cst) on a flat
; pointer passed inreg, addressing element 10 of a [512 x float] array (byte
; offset 40 in the checks). The data operand is defined by inline asm with the
; "=^VA" constraint and the result is consumed with "^VA" (the av_av case in
; the function name); the checks show VGPR v0 for both and no v_accvgpr
; copies. For both targets the checks show a flat_atomic_cmpswap retry loop
; around v_max_f32.
define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB228_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB228_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB230_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
|
|
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB230_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB232_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB232_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=a"()
|
|
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "a"(float %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v4
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB234_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v4
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB234_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
|
|
%data = call float asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
|
|
call void asm "; use $0", "^VA"(float %result)
|
|
ret void
|
|
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics f64, with aa+av cases using saddr
|
|
;---------------------------------------------------------------------
|
|
|
|
define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB235_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB235_4
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB235_5
|
|
; GFX90A-NEXT: s_branch .LBB235_6
|
|
; GFX90A-NEXT: .LBB235_3:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: s_branch .LBB235_7
|
|
; GFX90A-NEXT: .LBB235_4:
|
|
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
|
|
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB235_6: ; %Flow1
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB235_8
|
|
; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB235_3
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB235_4
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: s_cbranch_execz .LBB235_5
|
|
; GFX950-NEXT: s_branch .LBB235_6
|
|
; GFX950-NEXT: .LBB235_3:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: s_branch .LBB235_7
|
|
; GFX950-NEXT: .LBB235_4:
|
|
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
|
|
; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
|
|
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
|
|
; GFX950-NEXT: .LBB235_6: ; %Flow1
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB235_8
|
|
; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
|
|
; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=a"()
|
|
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(double %result)
|
|
ret void
|
|
}
|
|
|
|
define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base
|
|
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v[0:1]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB236_3
|
|
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
|
|
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
|
|
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
|
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_vccz .LBB236_4
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
|
|
; GFX90A-NEXT: s_cbranch_execz .LBB236_5
|
|
; GFX90A-NEXT: s_branch .LBB236_6
|
|
; GFX90A-NEXT: .LBB236_3:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: s_branch .LBB236_7
|
|
; GFX90A-NEXT: .LBB236_4:
|
|
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX90A-NEXT: .LBB236_5: ; %atomicrmw.private
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
|
|
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
|
|
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
|
|
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
|
|
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
|
|
; GFX90A-NEXT: .LBB236_6: ; %Flow1
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB236_8
|
|
; GFX90A-NEXT: .LBB236_7: ; %atomicrmw.shared
|
|
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
|
|
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: .LBB236_8: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v[2:3]
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base
|
|
; GFX950-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v[0:1]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB236_3
|
|
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
|
|
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
|
|
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
|
|
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_vccz .LBB236_4
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
|
|
; GFX950-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
|
|
; GFX950-NEXT: s_cbranch_execz .LBB236_5
|
|
; GFX950-NEXT: s_branch .LBB236_6
|
|
; GFX950-NEXT: .LBB236_3:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: s_branch .LBB236_7
|
|
; GFX950-NEXT: .LBB236_4:
|
|
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
|
|
; GFX950-NEXT: .LBB236_5: ; %atomicrmw.private
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
|
|
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
|
|
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
|
|
; GFX950-NEXT: .LBB236_6: ; %Flow1
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB236_8
|
|
; GFX950-NEXT: .LBB236_7: ; %atomicrmw.shared
|
|
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
|
|
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
|
|
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX950-NEXT: .LBB236_8: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v[2:3]
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
|
|
%data = call double asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(double %result)
|
|
ret void
|
|
}
|
|
|
|
; atomicrmw fsub on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "a" constraint,
; so the expected code copies between a[0:1] and VGPRs around the atomic.
; Expected lowering on both targets: the global path is a
; flat_atomic_cmpswap_x2 retry loop, the private path is a plain
; load / v_add_f64 / store sequence.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB237_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB237_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB237_6
; GFX90A-NEXT: .LBB237_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB237_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB237_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB237_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB237_6
; GFX950-NEXT: .LBB237_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB237_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=a"()
  %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fsub on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "^VA"
; constraint; in the expected output both stay in VGPRs (v[4:5] and v[0:1])
; and no v_accvgpr copies appear.
; Expected lowering on both targets: the global path is a
; flat_atomic_cmpswap_x2 retry loop, the private path is a plain
; load / v_add_f64 / store sequence.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB238_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB238_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB238_6
; GFX90A-NEXT: .LBB238_4:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB238_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB238_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB238_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB238_6
; GFX950-NEXT: .LBB238_4:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB238_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=^VA"()
  %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fmax on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "a" constraint,
; so the expected code copies between a[0:1] and VGPRs around the atomic.
; Expected lowering on both targets: the global path issues a single
; flat_atomic_max_f64; the private path loads the old value, applies
; v_max_f64 to each input with itself and then to the pair, and stores the
; result.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB239_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB239_3
; GFX90A-NEXT: s_branch .LBB239_4
; GFX90A-NEXT: .LBB239_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB239_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB239_3
; GFX950-NEXT: s_branch .LBB239_4
; GFX950-NEXT: .LBB239_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=a"()
  %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fmax on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "^VA"
; constraint; in the expected output both stay in VGPRs (v[2:3] and v[0:1])
; and no v_accvgpr copies appear.
; Expected lowering on both targets: the global path issues a single
; flat_atomic_max_f64; the private path loads the old value, applies
; v_max_f64 to each input with itself and then to the pair, and stores the
; result.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB240_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB240_3
; GFX90A-NEXT: s_branch .LBB240_4
; GFX90A-NEXT: .LBB240_2:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB240_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB240_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB240_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB240_3
; GFX950-NEXT: s_branch .LBB240_4
; GFX950-NEXT: .LBB240_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB240_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB240_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=^VA"()
  %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fmin on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "a" constraint,
; so the expected code copies between a[0:1] and VGPRs around the atomic.
; Expected lowering on both targets: the global path issues a single
; flat_atomic_min_f64; the private path loads the old value, applies
; v_max_f64 to each input with itself, combines them with v_min_f64, and
; stores the result.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB241_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB241_3
; GFX90A-NEXT: s_branch .LBB241_4
; GFX90A-NEXT: .LBB241_2:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB241_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB241_3
; GFX950-NEXT: s_branch .LBB241_4
; GFX950-NEXT: .LBB241_2:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=a"()
  %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fmin on f64 through a flat pointer passed inreg. The address is
; element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The operand
; is defined, and the result consumed, by inline asm using the "^VA"
; constraint; in the expected output both stay in VGPRs (v[2:3] and v[0:1])
; and no v_accvgpr copies appear.
; Expected lowering on both targets: the global path issues a single
; flat_atomic_min_f64; the private path loads the old value, applies
; v_max_f64 to each input with itself, combines them with v_min_f64, and
; stores the result.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB242_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_execz .LBB242_3
; GFX90A-NEXT: s_branch .LBB242_4
; GFX90A-NEXT: .LBB242_2:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB242_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB242_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB242_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: s_cbranch_execz .LBB242_3
; GFX950-NEXT: s_branch .LBB242_4
; GFX950-NEXT: .LBB242_2:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB242_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB242_4: ; %atomicrmw.end
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=^VA"()
  %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(double %result)
  ret void
}
|
|
|
|
; atomicrmw fmaximum on f64 through a flat pointer passed inreg. The address
; is element 10 of a [512 x double], i.e. byte offset 0x50 from %ptr. The
; operand is defined, and the result consumed, by inline asm using the "a"
; constraint, so the expected code copies between a[0:1] and VGPRs around the
; atomic.
; Expected lowering on both targets: the global path is a
; flat_atomic_cmpswap_x2 retry loop whose new value is v_max_f64 of the old
; and incoming values, replaced via v_cndmask by 0x7ff80000 (high dword) and
; 0 (low dword) when v_cmp_u_f64 reports an unordered compare. The private
; path does the same computation as a plain load / compute / store.
; On gfx950 the expected loop body has an s_nop 1 between v_cmp_u_f64 and the
; first v_cndmask, and the private block places both v_accvgpr_write
; instructions between the compare and the v_cndmask pair.
; The assertion lines are autogenerated by update_llc_test_checks.py;
; regenerate them instead of editing by hand.
define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB243_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB243_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB243_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB243_6
; GFX90A-NEXT: .LBB243_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB243_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB243_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB243_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB243_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB243_6
; GFX950-NEXT: .LBB243_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB243_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=a"()
  %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(double %result)
  ret void
}
|
|
|
|
; f64 `atomicrmw fmaximum` (workgroup scope, seq_cst) on element 10 of a
; [512 x double] array reached through an `inreg` flat pointer; the check
; lines below read the pointer from SGPRs and add the 0x50 byte offset.
; The operand is defined and the result consumed by inline asm with the
; "^VA" constraint; in the expected output both stay in VGPRs (v[4:5] in,
; v[0:1] out) with no AGPR copies.
; Expected code has two paths: a flat_atomic_cmpswap_x2 retry loop
; (%atomicrmw.global) and a load/compute/store sequence (%atomicrmw.private).
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB244_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB244_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB244_6
; GFX90A-NEXT: .LBB244_4:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB244_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB244_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB244_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB244_6
; GFX950-NEXT: .LBB244_4:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB244_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=^VA"()
  %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(double %result)
  ret void
}
|
|
|
|
; f64 `atomicrmw fminimum` (workgroup scope, seq_cst) on element 10 of a
; [512 x double] array reached through an `inreg` flat pointer; the check
; lines below read the pointer from SGPRs and add the 0x50 byte offset.
; The operand is defined and the result consumed by inline asm with the "a"
; constraint: the expected output copies a[0:1] into v[4:5] with
; v_accvgpr_read_b32 and writes the result back to a[0:1] with
; v_accvgpr_write_b32 on both paths.
; Expected code has two paths: a flat_atomic_cmpswap_x2 retry loop
; (%atomicrmw.global) and a load/compute/store sequence (%atomicrmw.private).
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB245_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB245_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB245_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB245_6
; GFX90A-NEXT: .LBB245_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB245_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: s_cbranch_vccz .LBB245_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB245_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB245_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB245_6
; GFX950-NEXT: .LBB245_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB245_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=a"()
  %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(double %result)
  ret void
}
|
|
|
|
; f64 `atomicrmw fminimum` (workgroup scope, seq_cst) on element 10 of a
; [512 x double] array reached through an `inreg` flat pointer; the check
; lines below read the pointer from SGPRs and add the 0x50 byte offset.
; The operand is defined and the result consumed by inline asm with the
; "^VA" constraint; in the expected output both stay in VGPRs (v[4:5] in,
; v[0:1] out) with no AGPR copies.
; Expected code has two paths: a flat_atomic_cmpswap_x2 retry loop
; (%atomicrmw.global) and a load/compute/store sequence (%atomicrmw.private).
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_add_u32 s4, s16, 0x50
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB246_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB246_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB246_6
; GFX90A-NEXT: .LBB246_4:
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB246_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_add_u32 s0, s0, 0x50
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX950-NEXT: s_addc_u32 s1, s1, 0
; GFX950-NEXT: s_cmp_eq_u32 s1, s3
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB246_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB246_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB246_6
; GFX950-NEXT: .LBB246_4:
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB246_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
  %data = call double asm "; def $0", "=^VA"()
  %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(double %result)
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics v2f16, with aa+av cases using saddr
|
|
;---------------------------------------------------------------------
|
|
|
|
; <2 x half> `atomicrmw fadd` (workgroup scope, seq_cst) on element 10 of a
; [512 x <2 x half>] array reached through an `inreg` flat pointer; the
; check lines below address it with `offset:40`.
; The operand is defined and the result consumed by inline asm with the "a"
; constraint: the expected output moves a0 to a VGPR with v_accvgpr_read_b32
; and writes the result back to a0 with v_accvgpr_write_b32.
; The two targets differ in the expected code: GFX90A uses a
; v_pk_add_f16 + flat_atomic_cmpswap retry loop, while GFX950 uses a single
; flat_atomic_pk_add_f16.
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB247_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB247_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; <2 x half> `atomicrmw fadd` (workgroup scope, seq_cst) on element 10 of a
; [512 x <2 x half>] array reached through an `inreg` flat pointer; the
; check lines below address it with `offset:40`.
; The operand is defined and the result consumed by inline asm with the
; "^VA" constraint; in the expected output both stay in VGPRs with no AGPR
; copies.
; The two targets differ in the expected code: GFX90A uses a
; v_pk_add_f16 + flat_atomic_cmpswap retry loop, while GFX950 uses a single
; flat_atomic_pk_add_f16.
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB248_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
; <2 x half> `atomicrmw fsub` (workgroup scope, seq_cst) on element 10 of a
; [512 x <2 x half>] array reached through an `inreg` flat pointer; the
; check lines below address it with `offset:40`.
; The operand is defined and the result consumed by inline asm with the "a"
; constraint: the expected output moves a0 to v4 with v_accvgpr_read_b32 and
; writes the result back to a0 with v_accvgpr_write_b32 after the loop.
; On both targets the expected code is a retry loop: v_pk_add_f16 with
; neg_lo/neg_hi set on the second operand, then flat_atomic_cmpswap.
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; <2 x half> `atomicrmw fsub` (workgroup scope, seq_cst) on element 10 of a
; [512 x <2 x half>] array reached through an `inreg` flat pointer; the
; check lines below address it with `offset:40`.
; The operand is defined and the result consumed by inline asm with the
; "^VA" constraint; in the expected output both stay in VGPRs (v4 in, v0 out)
; with no AGPR copies.
; On both targets the expected code is a retry loop: v_pk_add_f16 with
; neg_lo/neg_hi set on the second operand, then flat_atomic_cmpswap.
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB250_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB250_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
; <2 x half> `atomicrmw fmax` (workgroup scope, seq_cst) on element 10 of a
; [512 x <2 x half>] array reached through an `inreg` flat pointer; the
; check lines below address it with `offset:40`.
; The operand is defined and the result consumed by inline asm with the "a"
; constraint: the expected output moves a0 to a VGPR with v_accvgpr_read_b32
; and writes the result back to a0 with v_accvgpr_write_b32 after the loop.
; On both targets the expected code applies v_pk_max_f16 of the operand with
; itself once before the loop, then runs a retry loop of two v_pk_max_f16
; instructions followed by flat_atomic_cmpswap. GFX950 additionally expects
; an `s_nop 0` between the two in-loop v_pk_max_f16 instructions.
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fmax on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result go through
; "^VA" inline-asm constraints, which the expectations show assigned to VGPRs.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop; the gfx950
; expectations additionally carry a loop-header .p2align and an s_nop 0 between
; the two v_pk_max_f16 instructions inside the loop.
define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB252_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB252_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fmin on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result use the "a"
; inline-asm constraint; the expectations show them in a0, copied to and from a
; VGPR with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the loop.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop; the gfx950
; expectations additionally carry a loop-header .p2align and an s_nop 0 between
; v_pk_max_f16 and v_pk_min_f16 inside the loop.
define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fmin on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result go through
; "^VA" inline-asm constraints, which the expectations show assigned to VGPRs,
; so no v_accvgpr copies are expected.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop; the gfx950
; expectations additionally carry a loop-header .p2align and an s_nop 0 between
; v_pk_max_f16 and v_pk_min_f16 inside the loop.
define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB254_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB254_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fmaximum on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result use the "a"
; inline-asm constraint; the expectations show them in a0, copied to and from a
; VGPR with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the loop.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop. The gfx90a
; loop body is v_pk_max_f16 plus v_cmp_o_f16 / v_cndmask selects against 0x7e00
; and a v_perm_b32; the gfx950 loop body is a single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fmaximum on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result go through
; "^VA" inline-asm constraints, which the expectations show assigned to VGPRs
; (def v4 / use v0), so no v_accvgpr copies are expected.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop. The gfx90a
; loop body is v_pk_max_f16 plus v_cmp_o_f16 / v_cndmask selects against 0x7e00
; and a v_perm_b32; the gfx950 loop body is a single v_pk_maximum3_f16.
define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB256_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB256_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fminimum on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result use the "a"
; inline-asm constraint; the expectations show them in a0, copied to and from a
; VGPR with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the loop.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop. The gfx90a
; loop body is v_pk_min_f16 plus v_cmp_o_f16 / v_cndmask selects against 0x7e00
; and a v_perm_b32; the gfx950 loop body is a single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=a"()
  %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x half> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fminimum on <2 x half>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x half>] array (the
; expectations show offset:40). The data operand and the result go through
; "^VA" inline-asm constraints, which the expectations show assigned to VGPRs
; (def v4 / use v0), so no v_accvgpr copies are expected.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop. The gfx90a
; loop body is v_pk_min_f16 plus v_cmp_o_f16 / v_cndmask selects against 0x7e00
; and a v_perm_b32; the gfx950 loop body is a single v_pk_minimum3_f16.
define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4
; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB258_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v4, v4
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB258_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
  %data = call <2 x half> asm "; def $0", "=^VA"()
  %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x half> %result)
  ret void
}
|
|
|
|
;---------------------------------------------------------------------
|
|
; other atomics v2bf16, with aa+av cases using saddr
|
|
;---------------------------------------------------------------------
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fadd on <2 x bfloat>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x bfloat>] array (the
; expectations show offset:40). The data operand and the result use the "a"
; inline-asm constraint; the expectations show them in a0, copied to and from a
; VGPR with v_accvgpr_read_b32 / v_accvgpr_write_b32.
; The gfx90a expectations are a flat_atomic_cmpswap retry loop whose body does
; two v_add_f32 on the unpacked halves followed by an integer sequence
; (v_bfe_u32, v_or_b32, v_add3_u32, v_cmp_u_f32, v_cndmask_b32, v_perm_b32)
; that repacks the result. The gfx950 expectations have no loop: a single
; flat_atomic_pk_add_bf16 with sc0, preceded by s_nop 0 and v_accvgpr_read_b32.
define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB259_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fadd on <2 x bfloat>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x bfloat>] array (the
; expectations show offset:40). The data operand and the result go through
; "^VA" inline-asm constraints, which the expectations show assigned to VGPRs,
; so no v_accvgpr copies are expected.
; The gfx90a expectations are a flat_atomic_cmpswap retry loop whose body does
; two v_add_f32 on the unpacked halves followed by an integer sequence
; (v_bfe_u32, v_or_b32, v_add3_u32, v_cmp_u_f32, v_cndmask_b32, v_perm_b32)
; that repacks the result. The gfx950 expectations have no loop: a single
; flat_atomic_pk_add_bf16 with sc0.
define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB260_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_mov_b32_e32 v1, s1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=^VA"()
  %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "^VA"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Returning workgroup-scope seq_cst atomicrmw fsub on <2 x bfloat>, addressed
; through an inreg pointer at element 10 of a [512 x <2 x bfloat>] array (the
; expectations show offset:40). The data operand and the result use the "a"
; inline-asm constraint; the expectations show them in a0, copied to and from a
; VGPR with v_accvgpr_read_b32 / v_accvgpr_write_b32 around the loop.
; Both targets are expected to emit a flat_atomic_cmpswap retry loop built on
; two v_sub_f32 over the unpacked halves. The gfx90a loop repacks the result
; with an integer sequence (v_bfe_u32, v_or_b32, v_add3_u32, v_cmp_u_f32,
; v_cndmask_b32, v_perm_b32); the gfx950 loop repacks with a single
; v_cvt_pk_bf16_f32.
define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_sub_f32_e32 v0, v0, v4
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
  %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
  %data = call <2 x bfloat> asm "; def $0", "=a"()
  %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
  call void asm "; use $0", "a"(<2 x bfloat> %result)
  ret void
}
|
|
|
|
; Returning `atomicrmw fsub` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "^VA" constraint; the assertions show both placed in v0. Both targets are
; expected to emit a flat_atomic_cmpswap retry loop: GFX950 repacks the two f32
; results with v_cvt_pk_bf16_f32, while GFX90A uses the longer
; bfe/add3/cndmask/perm sequence.
define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
|
|
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
|
|
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB262_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_sub_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB262_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmax` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "a" constraint; the assertions show the value defined in a0, copied to a
; VGPR with v_accvgpr_read_b32 before the loop, and written back to a0 with
; v_accvgpr_write_b32 after it. Both targets are expected to emit a
; flat_atomic_cmpswap retry loop around v_max_f32.
define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB263_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
|
|
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
|
|
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmax` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "^VA" constraint; the assertions show both placed in v0, with no
; v_accvgpr copies. Both targets are expected to emit a flat_atomic_cmpswap
; retry loop around v_max_f32.
define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
|
|
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
|
|
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB264_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB264_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmin` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "a" constraint; the assertions show the value defined in a0, copied to a
; VGPR with v_accvgpr_read_b32 before the loop, and written back to a0 with
; v_accvgpr_write_b32 after it. Both targets are expected to emit a
; flat_atomic_cmpswap retry loop around v_min_f32.
define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB265_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
|
|
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
|
|
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmin` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "^VA" constraint; the assertions show both placed in v0, with no
; v_accvgpr copies. Both targets are expected to emit a flat_atomic_cmpswap
; retry loop around v_min_f32.
define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
|
|
; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
|
|
; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB266_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
|
|
; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB266_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmaximum` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "a" constraint; the assertions show the value defined in a0, copied to a
; VGPR with v_accvgpr_read_b32 before the loop, and written back to a0 with
; v_accvgpr_write_b32 after it. Inside the flat_atomic_cmpswap retry loop,
; GFX90A is expected to pair v_max_f32 with a v_cmp_o_f32 / v_cndmask select of
; the constant 0x7fc00000, while GFX950 is expected to use v_maximum3_f32.
define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB267_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4
|
|
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
|
|
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
|
|
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
|
|
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=a"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fmaximum` of <2 x bfloat> at workgroup scope on a flat
; pointer passed as `ptr inreg` (the assertions read it from s[16:17] on GFX90A
; and from s[0:1] on GFX950). The address is element 10 of the array, which the
; assertions show as `offset:40`. Operand and result go through inline asm with
; the "^VA" constraint; the assertions show both placed in v0, with no
; v_accvgpr copies. Inside the flat_atomic_cmpswap retry loop, GFX90A is
; expected to pair v_max_f32 with a v_cmp_o_f32 / v_cndmask select of the
; constant 0x7fc00000, while GFX950 is expected to use v_maximum3_f32.
define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4
|
|
; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
|
|
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
|
|
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB268_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
|
|
; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB268_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: atomicrmw fminimum on <2 x bfloat> through an "inreg" pointer argument
; (s[16:17] / s[0:1] in the assertions below), syncscope("workgroup"), seq_cst,
; tagged with !amdgpu.no.fine.grained.memory.
; The address is element 10 of a [512 x <2 x bfloat>] array, which appears in
; the expected output as "offset:40".
; Both the data operand and the returned value use the "a" inline-asm
; constraint; the assertions show them in a0, with v_accvgpr_read_b32 before
; the loop and v_accvgpr_write_b32 after it.
; The expected code is a compare-and-swap loop (flat_atomic_cmpswap inside
; %atomicrmw.start). The first check group computes each half with v_min_f32
; plus v_cmp_o_f32 / v_cndmask selects against 0x7fc00000 and packs the halves
; with v_perm_b32; the second check group uses v_minimum3_f32 and
; v_cvt_pk_bf16_f32 instead.
; NOTE(review): the RUN lines are outside this excerpt; the two check prefixes
; presumably correspond to the gfx90a and gfx950 subtargets - confirm.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB269_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4
|
|
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
|
|
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
|
|
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use a0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4
|
|
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use a0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=a"()
|
|
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "a"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Test: atomicrmw fminimum on <2 x bfloat> through an "inreg" pointer argument
; (s[16:17] / s[0:1] in the assertions below), syncscope("workgroup"), seq_cst,
; tagged with !amdgpu.no.fine.grained.memory.
; The address is element 10 of a [512 x <2 x bfloat>] array, which appears in
; the expected output as "offset:40".
; Both the data operand and the returned value use the "^VA" inline-asm
; constraint; the assertions show them in v0 ("; def v0" / "; use v0") with no
; v_accvgpr_read_b32 / v_accvgpr_write_b32 copies.
; The expected code is a compare-and-swap loop (flat_atomic_cmpswap inside
; %atomicrmw.start). The first check group computes each half with v_min_f32
; plus v_cmp_o_f32 / v_cndmask selects against 0x7fc00000 and packs the halves
; with v_perm_b32; the second check group uses v_minimum3_f32 and
; v_cvt_pk_bf16_f32 instead.
; NOTE(review): the RUN lines are outside this excerpt; the two check prefixes
; presumably correspond to the gfx90a and gfx950 subtargets - confirm.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
|
|
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av:
|
|
; GFX90A: ; %bb.0:
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; def v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
|
|
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
|
|
; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
|
|
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
|
|
; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
|
|
; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4
|
|
; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
|
|
; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
|
|
; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
|
|
; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
|
|
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
|
|
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
|
|
; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
|
|
; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
|
|
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
|
|
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
|
|
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
|
|
; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
|
|
; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
|
|
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB270_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
|
|
; GFX90A-NEXT: ;;#ASMSTART
|
|
; GFX90A-NEXT: ; use v0
|
|
; GFX90A-NEXT: ;;#ASMEND
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av:
|
|
; GFX950: ; %bb.0:
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; def v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
|
|
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX950-NEXT: .p2align 5, , 4
|
|
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
|
|
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
|
|
; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
|
|
; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4
|
|
; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
|
|
; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
|
|
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
|
|
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
|
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX950-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: s_cbranch_execnz .LBB270_1
|
|
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX950-NEXT: ;;#ASMSTART
|
|
; GFX950-NEXT: ; use v0
|
|
; GFX950-NEXT: ;;#ASMEND
|
|
; GFX950-NEXT: s_setpc_b64 s[30:31]
|
|
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
|
|
%data = call <2 x bfloat> asm "; def $0", "=^VA"()
|
|
%result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
call void asm "; use $0", "^VA"(<2 x bfloat> %result)
|
|
ret void
|
|
}
|
|
|
|
; Attribute group #0, referenced by the function definitions above: nounwind
; plus the "amdgpu-waves-per-eu" string attribute set to "10,10".
; NOTE(review): presumably a min,max waves-per-EU request that constrains the
; register budget for these tests - confirm against the AMDGPU backend
; documentation.
attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
|
|
|
|
; Empty metadata node; used above as the operand of
; !amdgpu.no.fine.grained.memory on the atomicrmw instructions.
!0 = !{}
|
|
|
|
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
; CHECK: {{.*}}
|