
; Note: Implements the base of the MemoryLegalizer for a roughly correct GFX1250
; memory model. Documentation will follow, and some remaining changes still have
; to be added, but this is the backbone of the model.
; Listing metadata: 2383 lines, 102 KiB, LLVM.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX1250

; Intrinsic declarations used by the tests in this file.
;
; The buffer atomics (fadd / fmin / fmax on double) are each declared in four
; flavours: "struct" and "raw" addressing, and with the buffer resource passed
; either as <4 x i32> or as ptr addrspace(8). The struct forms take one more
; i32 operand than the raw forms; in every form the last operand is an immarg.
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
; LDS (addrspace(3)) f64 fadd intrinsic.
declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)

; Raw buffer f64 fadd through the <4 x i32> resource intrinsic; the returned
; value is never used. The checks expect buffer_atomic_add_f64 with offen and no
; return-requesting modifier (no glc / sc0 / th:TH_ATOMIC_RETURN) on all three
; targets.
define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  ret void
}

; Raw buffer f64 fadd (<4 x i32> resource, passed inreg) whose result is used:
; it is stored through a poison pointer, presumably only to keep the value live.
; The checks expect the returning form of the atomic (glc on gfx90a, sc0 on
; gfx942, th:TH_ATOMIC_RETURN on gfx1250) and a wait on the atomic's result
; before the flat store.
define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning raw buffer f64 fadd (<4 x i32> resource) with the constant 4 as the
; fourth intrinsic operand and 2 as the trailing immarg (the "slc" case, going by
; the test name). Per the checks, the 4 ends up as the scalar offset operand
; (an inline 4 on gfx90a/gfx942, materialized into s6 on gfx1250) and the
; immarg shows up as "glc slc", "sc0 nt" and th:TH_ATOMIC_NT_RETURN
; respectively. The result is stored to the global pointer %out.
define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_mov_b32 s6, 4
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
  store double %ret, ptr addrspace(1) %out, align 8
  ret void
}

; Raw buffer f64 fadd through the ptr addrspace(8) resource intrinsic; the
; returned value is never used. The expected code matches the <4 x i32> variant:
; buffer_atomic_add_f64 with offen and no return-requesting modifier.
define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
  ret void
}

; Raw buffer f64 fadd (ptr addrspace(8) resource, passed inreg) whose result is
; used: it is stored through a poison pointer, presumably only to keep the value
; live. The checks expect the returning form (glc on gfx90a, sc0 on gfx942,
; th:TH_ATOMIC_RETURN on gfx1250) followed by a wait before the flat store.
define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning raw buffer f64 fadd (ptr addrspace(8) resource) with the constant 4
; as the fourth intrinsic operand and 2 as the trailing immarg (the "slc" case,
; going by the test name). Per the checks, the 4 becomes the scalar offset
; operand (inline on gfx90a/gfx942, via s6 on gfx1250) and the immarg appears
; as "glc slc", "sc0 nt" and th:TH_ATOMIC_NT_RETURN respectively. The result is
; stored to the global pointer %out.
define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_mov_b32 s6, 4
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
  store double %ret, ptr addrspace(1) %out, align 8
  ret void
}

; Struct buffer f64 fadd through the <4 x i32> resource intrinsic; the returned
; value is never used. The checks expect buffer_atomic_add_f64 with idxen
; (rather than offen as in the raw tests) and no return-requesting modifier.
define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  ret void
}

; Struct buffer f64 fadd (<4 x i32> resource, passed inreg) whose result is
; used: it is stored through a poison pointer, presumably only to keep the value
; live. The checks expect the returning idxen form (glc on gfx90a, sc0 on
; gfx942, th:TH_ATOMIC_RETURN on gfx1250) followed by a wait before the store.
define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning struct buffer f64 fadd (<4 x i32> resource) with the constant 4 as
; the fourth intrinsic operand, 0 as the fifth and 2 as the trailing immarg (the
; "slc" case, going by the test name). Per the checks, the 4 is folded into the
; instruction as offset:4 (no extra scalar register, unlike the raw tests) and
; the immarg appears as "glc slc", "sc0 nt" and th:TH_ATOMIC_NT_RETURN
; respectively. The result is stored to the global pointer %out.
define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
  store double %ret, ptr addrspace(1) %out, align 8
  ret void
}

; Struct buffer f64 fadd through the ptr addrspace(8) resource intrinsic; the
; returned value is never used. The expected code matches the <4 x i32>
; variant: buffer_atomic_add_f64 with idxen and no return-requesting modifier.
define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  ret void
}

; Struct buffer f64 fadd (ptr addrspace(8) resource, passed inreg) whose result
; is used: it is stored through a poison pointer, presumably only to keep the
; value live. The checks expect the returning idxen form (glc on gfx90a, sc0 on
; gfx942, th:TH_ATOMIC_RETURN on gfx1250) followed by a wait before the store.
define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning struct buffer f64 fadd (ptr addrspace(8) resource) with the constant
; 4 as the fourth intrinsic operand, 0 as the fifth and 2 as the trailing immarg
; (the "slc" case, going by the test name). Per the checks, the 4 is folded into
; the instruction as offset:4 and the immarg appears as "glc slc", "sc0 nt" and
; th:TH_ATOMIC_NT_RETURN respectively. The result is stored to the global
; pointer %out.
define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
  store double %ret, ptr addrspace(1) %out, align 8
  ret void
}

; Raw buffer f64 fmin through the <4 x i32> resource intrinsic; the returned
; value is never used. The checks expect buffer_atomic_min_f64 on gfx90a and
; gfx942 and buffer_atomic_min_num_f64 on gfx1250, with offen and no
; return-requesting modifier.
define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  ret void
}

; Raw buffer f64 fmin (<4 x i32> resource, passed inreg) whose result is used:
; it is stored through a poison pointer, presumably only to keep the value live.
; The checks expect the returning form (glc on gfx90a, sc0 on gfx942,
; th:TH_ATOMIC_RETURN on gfx1250, where the mnemonic is
; buffer_atomic_min_num_f64) followed by a wait before the flat store.
define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning raw buffer f64 fmin (<4 x i32> resource) with the constant 4 as the
; fourth intrinsic operand and 2 as the trailing immarg (the "slc" case, going by
; the test name). Per the checks, the 4 becomes the scalar offset operand
; (inline on gfx90a/gfx942, via s6 on gfx1250) and the immarg appears as
; "glc slc", "sc0 nt" and th:TH_ATOMIC_NT_RETURN respectively. The result is
; stored to the global pointer %out.
define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_mov_b32 s6, 4
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
  store double %ret, ptr addrspace(1) %out, align 8
  ret void
}

; Raw buffer f64 fmin through the ptr addrspace(8) resource intrinsic; the
; returned value is never used. The expected code matches the <4 x i32> variant:
; buffer_atomic_min_f64 (buffer_atomic_min_num_f64 on gfx1250) with offen and no
; return-requesting modifier.
define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
  ret void
}

; Raw buffer f64 fmin (ptr addrspace(8) resource, passed inreg) whose result is
; used: it is stored through a poison pointer, presumably only to keep the value
; live. The checks expect the returning form (glc on gfx90a, sc0 on gfx942,
; th:TH_ATOMIC_RETURN on gfx1250, where the mnemonic is
; buffer_atomic_min_num_f64) followed by a wait before the flat store.
define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
main_body:
  %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
  store double %ret, ptr poison
  ret void
}

; Returning f64 fmin through the raw.ptr buffer atomic intrinsic in a kernel,
; with the two trailing immediate operands set to 4 and 2. In the expected
; output the 4 occupies the operand right after the resource descriptor
; (printed inline on gfx90a/gfx942, materialized into s6 on gfx1250), and the
; 2 selects the non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and
; TH_ATOMIC_NT_RETURN on gfx1250. The result is stored to %out, so the
; returning form of the atomic and a wait before the store are expected.
; Despite its name, %vindex is consumed in `offen` mode.
define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_mov_b32 s6, 4
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmin through the struct buffer atomic intrinsic in a
; kernel. %ret is never used, and the checks expect the atomic without a
; return modifier (no glc / sc0 / th) and with no wait before s_endpgm. The
; arguments are fetched with scalar loads based at s[4:5], and %vindex is
; consumed in `idxen` mode. gfx1250 spells the instruction
; buffer_atomic_min_num_f64 and prints `null` where the older targets print 0.
define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_min_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmin through the struct buffer atomic intrinsic in an
; amdgpu_ps function. The checks expect %rsrc (inreg) in s[0:3], %data in
; v[0:1] and %vindex in v2, consumed in `idxen` mode. The result is stored
; through a poison pointer (presumably just to give the return value a use),
; so each target is expected to emit the returning form of the atomic (`glc`
; on gfx90a, `sc0` on gfx942, TH_ATOMIC_RETURN on gfx1250) and wait on it
; before the store.
define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmin through the struct buffer atomic intrinsic in a kernel,
; with the three trailing operands set to 4, 0 and 2. In the expected output
; the 4 is folded into the instruction's immediate `offset:4` field, the
; operand after the resource descriptor stays 0 / null, and the 2 selects the
; non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and TH_ATOMIC_NT_RETURN
; on gfx1250. The result is stored to %out, so the returning form of the
; atomic and a wait before the store are expected. %vindex is consumed in
; `idxen` mode.
define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmin through the struct.ptr buffer atomic intrinsic
; (resource passed as ptr addrspace(8)) in a kernel. %ret is never used, and
; the checks expect the atomic without a return modifier (no glc / sc0 / th)
; and with no wait before s_endpgm. The arguments are fetched with scalar
; loads based at s[4:5], and %vindex is consumed in `idxen` mode. gfx1250
; spells the instruction buffer_atomic_min_num_f64 and prints `null` where
; the older targets print 0.
define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmin through the struct.ptr buffer atomic intrinsic in an
; amdgpu_ps function. The checks expect %rsrc (inreg) in s[0:3], %data in
; v[0:1] and %vindex in v2, consumed in `idxen` mode. The result is stored
; through a poison pointer (presumably just to give the return value a use),
; so each target is expected to emit the returning form of the atomic (`glc`
; on gfx90a, `sc0` on gfx942, TH_ATOMIC_RETURN on gfx1250) and wait on it
; before the store.
define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmin through the struct.ptr buffer atomic intrinsic in a
; kernel, with the three trailing operands set to 4, 0 and 2. In the expected
; output the 4 is folded into the instruction's immediate `offset:4` field,
; the operand after the resource descriptor stays 0 / null, and the 2 selects
; the non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and
; TH_ATOMIC_NT_RETURN on gfx1250. The result is stored to %out, so the
; returning form of the atomic and a wait before the store are expected.
; %vindex is consumed in `idxen` mode.
define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmax through the raw buffer atomic intrinsic in a kernel.
; %ret is never used, and the checks expect the atomic without a return
; modifier (no glc / sc0 / th) and with no wait before s_endpgm. The
; arguments are fetched with scalar loads based at s[4:5]. Despite its name,
; %vindex is consumed in `offen` mode. gfx1250 spells the instruction
; buffer_atomic_max_num_f64 and prints `null` where the older targets print 0.
define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_buffer_atomic_max_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the raw buffer atomic intrinsic in an amdgpu_ps
; function. The checks expect %rsrc (inreg) in s[0:3], %data in v[0:1] and
; %vindex in v2; despite its name, %vindex is consumed in `offen` mode on
; every target. The result is stored through a poison pointer (presumably
; just to give the return value a use), so each target is expected to emit
; the returning form of the atomic (`glc` on gfx90a, `sc0` on gfx942,
; TH_ATOMIC_RETURN on gfx1250) and wait on it before the store.
define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the raw buffer atomic intrinsic in a kernel,
; with the two trailing immediate operands set to 4 and 2. In the expected
; output the 4 occupies the operand right after the resource descriptor
; (printed inline on gfx90a/gfx942, materialized into s6 on gfx1250), and the
; 2 selects the non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and
; TH_ATOMIC_NT_RETURN on gfx1250. The result is stored to %out, so the
; returning form of the atomic and a wait before the store are expected.
; Despite its name, %vindex is consumed in `offen` mode.
define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_mov_b32 s6, 4
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmax through the raw.ptr buffer atomic intrinsic
; (resource passed as ptr addrspace(8)) in a kernel. %ret is never used, and
; the checks expect the atomic without a return modifier (no glc / sc0 / th)
; and with no wait before s_endpgm. The arguments are fetched with scalar
; loads based at s[4:5]. Despite its name, %vindex is consumed in `offen`
; mode. gfx1250 spells the instruction buffer_atomic_max_num_f64 and prints
; `null` where the older targets print 0.
define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the raw.ptr buffer atomic intrinsic in an
; amdgpu_ps function. The checks expect %rsrc (inreg) in s[0:3], %data in
; v[0:1] and %vindex in v2; despite its name, %vindex is consumed in `offen`
; mode on every target. The result is stored through a poison pointer
; (presumably just to give the return value a use), so each target is
; expected to emit the returning form of the atomic (`glc` on gfx90a, `sc0`
; on gfx942, TH_ATOMIC_RETURN on gfx1250) and wait on it before the store.
define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the raw.ptr buffer atomic intrinsic in a kernel,
; with the two trailing immediate operands set to 4 and 2. In the expected
; output the 4 occupies the operand right after the resource descriptor
; (printed inline on gfx90a/gfx942, materialized into s6 on gfx1250), and the
; 2 selects the non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and
; TH_ATOMIC_NT_RETURN on gfx1250. The result is stored to %out, so the
; returning form of the atomic and a wait before the store are expected.
; Despite its name, %vindex is consumed in `offen` mode.
define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_mov_b32 s6, 4
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmax through the struct buffer atomic intrinsic in a
; kernel. %ret is never used, and the checks expect the atomic without a
; return modifier (no glc / sc0 / th) and with no wait before s_endpgm. The
; arguments are fetched with scalar loads based at s[4:5], and %vindex is
; consumed in `idxen` mode. gfx1250 spells the instruction
; buffer_atomic_max_num_f64 and prints `null` where the older targets print 0.
define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_max_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the struct buffer atomic intrinsic in an
; amdgpu_ps function. The checks expect %rsrc (inreg) in s[0:3], %data in
; v[0:1] and %vindex in v2, consumed in `idxen` mode. The result is stored
; through a poison pointer (presumably just to give the return value a use),
; so each target is expected to emit the returning form of the atomic (`glc`
; on gfx90a, `sc0` on gfx942, TH_ATOMIC_RETURN on gfx1250) and wait on it
; before the store.
define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 fmax through the struct buffer atomic intrinsic in a kernel,
; with the three trailing operands set to 4, 0 and 2. In the expected output
; the 4 is folded into the instruction's immediate `offset:4` field, the
; operand after the resource descriptor stays 0 / null, and the 2 selects the
; non-temporal hint: `slc` on gfx90a, `nt` on gfx942 and TH_ATOMIC_NT_RETURN
; on gfx1250. The result is stored to %out, so the returning form of the
; atomic and a wait before the store are expected. %vindex is consumed in
; `idxen` mode.
define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 fmax through the struct.ptr buffer atomic intrinsic
; (resource passed as ptr addrspace(8)) in a kernel. %ret is never used, and
; the checks expect the atomic without a return modifier (no glc / sc0 / th)
; and with no wait before s_endpgm. The arguments are fetched with scalar
; loads based at s[4:5], and %vindex is consumed in `idxen` mode. gfx1250
; spells the instruction buffer_atomic_max_num_f64 and prints `null` where
; the older targets print 0.
define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s8
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
ret void
|
|
}
|
|
|
|
; Test: returning form of llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64 in an
; amdgpu_ps function whose resource descriptor is passed inreg. The result is
; stored through a poison pointer so that the call's return value is used. The
; expected atomic carries the return modifier of each target (glc on gfx90a,
; sc0 on gfx942, th:TH_ATOMIC_RETURN on gfx1250) and is followed by a wait on
; the load/vm counter before the flat store.
define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
|
|
store double %ret, ptr poison
|
|
ret void
|
|
}
|
|
|
|
; Test: returning form of llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64 in a
; kernel, with the first trailing i32 operand set to 4 and the final immediate
; operand set to 2. The expected atomic shows offset:4 on every target, and the
; immediate 2 appears as "glc slc" on gfx90a, "sc0 nt" on gfx942 and
; th:TH_ATOMIC_NT_RETURN on gfx1250. The returned value is written to %out
; with a global store after waiting for the atomic's result.
define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
|
|
; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX942-NEXT: s_load_dword s10, s[4:5], 0x3c
|
|
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
|
|
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s10
|
|
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
|
|
store double %ret, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of the constant 4.0 on an addrspace(1) pointer
; with no explicit syncscope, tagged !amdgpu.no.fine.grained.memory, and with
; an unused result. Every target is expected to select the non-returning
; global_atomic_add_f64 and to bracket it with cache maintenance: buffer_wbl2
; before and buffer_invl2 + buffer_wbinvl1_vol after on gfx90a; buffer_wbl2 /
; buffer_inv with "sc0 sc1" on gfx942 (the atomic itself carries sc1);
; global_wb / global_inv with scope:SCOPE_SYS on gfx1250.
; Attribute group #1 is defined outside this part of the file.
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_noret_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer at
; syncscope("agent"), tagged !amdgpu.no.fine.grained.memory, result unused.
; Expected output is the non-returning global_atomic_add_f64 on every target.
; Cache maintenance around it: only buffer_wbinvl1_vol after the atomic on
; gfx90a (no buffer_wbl2 / buffer_invl2); buffer_wbl2 sc1 before and
; buffer_inv sc1 after on gfx942; global_wb / global_inv with scope:SCOPE_DEV
; on gfx1250.
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer, tagged
; !amdgpu.no.fine.grained.memory, result unused. Despite the "_system" suffix
; in the name, the syncscope written in the IR is "one-as". Expected output is
; the non-returning global_atomic_add_f64 bracketed by the widest cache
; maintenance shown in this file: buffer_wbl2 / buffer_invl2 +
; buffer_wbinvl1_vol on gfx90a, "sc0 sc1" writeback / invalidate on gfx942,
; and scope:SCOPE_SYS on gfx1250.
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_system:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: agent-scope seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer,
; tagged !amdgpu.no.fine.grained.memory, result unused. The function carries
; attribute group #0 rather than the #1 used by the neighbouring global fadd
; tests; #0 is defined outside this part of the file (presumably it changes
; the f64 denormal mode, given the "_flush" suffix - TODO confirm against the
; attribute definitions). Expected output is still the non-returning
; global_atomic_add_f64 on every target, with agent-level cache maintenance
; (buffer_wbinvl1_vol on gfx90a, sc1 on gfx942, scope:SCOPE_DEV on gfx1250).
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: returning variant of the default-syncscope seq_cst "atomicrmw fadd" on
; an addrspace(1) pointer, in a non-kernel function that returns the old
; value. The %data parameter is not used; the addend is the constant 4.0.
; Expected output is global_atomic_add_f64 in its returning form (glc on
; gfx90a, "sc0 sc1" on gfx942, th:TH_ATOMIC_RETURN scope:SCOPE_SYS on gfx1250)
; with a writeback before it and an invalidate after it.
define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: returning seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer at
; syncscope("agent"), in a non-kernel function that returns the old value
; (%data is unused). Expected output is the returning global_atomic_add_f64
; (glc on gfx90a, sc0 on gfx942, th:TH_ATOMIC_RETURN scope:SCOPE_DEV on
; gfx1250). gfx90a is expected to emit only buffer_wbinvl1_vol after the
; atomic; gfx942 uses buffer_wbl2 sc1 / buffer_inv sc1; gfx1250 uses
; global_wb / global_inv with scope:SCOPE_DEV.
define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, double %data) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: returning seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer
; in a non-kernel function that returns the old value (%data is unused).
; Despite the "_system" suffix, the syncscope written in the IR is "one-as".
; Expected output is the returning global_atomic_add_f64 (glc on gfx90a,
; "sc0 sc1" on gfx942, th:TH_ATOMIC_RETURN scope:SCOPE_SYS on gfx1250) with
; buffer_wbl2 / buffer_invl2 + buffer_wbinvl1_vol, "sc0 sc1" writeback /
; invalidate, and scope:SCOPE_SYS writeback / invalidate respectively.
define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, double %data) #1 {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: agent-scope seq_cst "atomicrmw fadd" of 4.0 on an addrspace(1) pointer
; with no function attribute group and without the
; !amdgpu.no.fine.grained.memory metadata used by the neighbouring tests.
; On gfx90a the expected output is a compare-and-swap loop: an initial scalar
; load of the value, then v_add_f64 + global_atomic_cmpswap_x2 repeated until
; the returned value equals the expected one. gfx942 and gfx1250 are still
; expected to emit the native non-returning global_atomic_add_f64.
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
|
|
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
|
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of 4.0 on a generic (unqualified) pointer with
; no explicit syncscope, tagged with !noalias.addrspace !1 and
; !amdgpu.no.fine.grained.memory, result unused. Every target is expected to
; select the non-returning flat_atomic_add_f64. The wait after the atomic
; covers both counters on gfx90a / gfx942 (vmcnt and lgkmcnt) and is
; s_wait_storecnt_dscnt on gfx1250; cache maintenance matches the widest
; scope (buffer_wbl2 / buffer_invl2, "sc0 sc1", scope:SCOPE_SYS).
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of 4.0 on a generic pointer at
; syncscope("agent"), tagged with !noalias.addrspace !1 and
; !amdgpu.no.fine.grained.memory, result unused. Expected output is the
; non-returning flat_atomic_add_f64 on every target, with agent-level cache
; maintenance: only buffer_wbinvl1_vol after the atomic on gfx90a,
; buffer_wbl2 sc1 / buffer_inv sc1 on gfx942, and global_wb / global_inv
; with scope:SCOPE_DEV on gfx1250.
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst "atomicrmw fadd" of 4.0 on a generic pointer, tagged with
; !noalias.addrspace !1 and !amdgpu.no.fine.grained.memory, result unused.
; Despite the "_system" suffix, the syncscope written in the IR is "one-as".
; Expected output is the non-returning flat_atomic_add_f64 with the widest
; cache maintenance (buffer_wbl2 / buffer_invl2, "sc0 sc1", scope:SCOPE_SYS).
; The wait that follows the atomic is narrower than the two-counter waits in
; the other flat noret tests of this file: s_waitcnt vmcnt(0) only on gfx90a
; and gfx942, and s_wait_storecnt 0x0 (no dscnt) on gfx1250.
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_system:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Test: returning seq_cst "atomicrmw fadd" of 4.0 on a generic pointer with no
; explicit syncscope, tagged with !noalias.addrspace !1 and
; !amdgpu.no.fine.grained.memory, in a non-kernel function that returns the
; old value. Expected output is the returning flat_atomic_add_f64 (glc on
; gfx90a, "sc0 sc1" on gfx942, th:TH_ATOMIC_RETURN scope:SCOPE_SYS on
; gfx1250), followed by a wait on both counters (vmcnt + lgkmcnt, or
; s_wait_loadcnt_dscnt) before the invalidate.
define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: returning seq_cst "atomicrmw fadd" of 4.0 on a generic pointer at
; syncscope("agent"), tagged with !noalias.addrspace !1 and
; !amdgpu.no.fine.grained.memory, in a non-kernel function that returns the
; old value. Expected output is the returning flat_atomic_add_f64 (glc on
; gfx90a, sc0 on gfx942, th:TH_ATOMIC_RETURN scope:SCOPE_DEV on gfx1250) with
; agent-level cache maintenance: buffer_wbinvl1_vol only on gfx90a,
; buffer_wbl2 sc1 / buffer_inv sc1 on gfx942, scope:SCOPE_DEV on gfx1250.
define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: returning seq_cst "atomicrmw fadd" of 4.0 on a generic pointer, tagged
; with !noalias.addrspace !1 and !amdgpu.no.fine.grained.memory, in a
; non-kernel function that returns the old value. Despite the "_system"
; suffix, the syncscope written in the IR is "one-as". Expected output is the
; returning flat_atomic_add_f64 with the widest cache maintenance. The waits
; are split: only vmcnt / loadcnt is waited on between the atomic and the
; invalidate, and the remaining counter (lgkmcnt on gfx90a / gfx942, dscnt on
; gfx1250) is waited on after the invalidate, just before the return.
define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: buffer_wbl2
|
|
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-NEXT: buffer_invl2
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: buffer_wbl2 sc0 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc0 sc1
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Test: agent-scope seq_cst "atomicrmw fadd" of 4.0 on a generic pointer with
; no function attribute group; the instruction keeps !noalias.addrspace !1
; but omits the !amdgpu.no.fine.grained.memory metadata used by the
; neighbouring flat tests. On gfx90a the expected output is a
; compare-and-swap loop: an initial flat_load_dwordx2, then v_add_f64 +
; flat_atomic_cmpswap_x2 repeated until the returned value equals the
; expected one. gfx942 and gfx1250 are still expected to emit the native
; non-returning flat_atomic_add_f64.
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
|
|
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
|
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
|
|
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
|
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: buffer_wbinvl1_vol
|
|
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
|
|
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
|
; GFX942-NEXT: buffer_wbl2 sc1
|
|
; GFX942-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: buffer_inv sc1
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
|
|
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Non-returning f64 add on an addrspace(3) pointer through the
; @llvm.amdgcn.ds.fadd.f64 intrinsic, in a kernel. %ptr and %data are kernel
; arguments (loaded at offsets 0x24 and 0x2c in the assertions) and the call
; result is unused.
; All three run lines expect a ds_add_f64 (not the _rtn form) followed by
; s_waitcnt lgkmcnt(0) (s_wait_dscnt 0x0 on gfx1250) before s_endpgm.
define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_noret:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24
|
|
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_noret:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24
|
|
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_noret:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_clause 0x1
|
|
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
|
|
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
|
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
; NOTE(review): the meaning of the trailing i32 0, i32 0, i1 0 operands is not
; visible in this file -- see the intrinsic definition.
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
|
|
ret void
|
|
}
|
|
|
|
; Returning f64 add on an addrspace(3) pointer through the
; @llvm.amdgcn.ds.fadd.f64 intrinsic, in a non-kernel function; the intrinsic
; result is the function's return value.
; All three run lines expect ds_add_rtn_f64 writing v[0:1], with the data
; operand first copied from v[1:2] into v[2:3]. The gfx1250 assertions
; additionally expect s_wait_storecnt 0x0 right before the DS instruction.
define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_rtn:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_rtn:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
|
|
ret double %ret
|
|
}
|
|
|
|
; Non-returning `atomicrmw fadd` of the constant 4.0 on an addrspace(3)
; pointer, seq_cst, tagged !amdgpu.no.fine.grained.memory; the function uses
; attribute group #1.
; All three run lines expect a single ds_add_f64. The gfx90a assertions build
; the constant with two v_mov_b32 (0 and 0x40100000); the gfx942 and gfx1250
; assertions use one v_mov_b64_e32 with the 4.0 literal.
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_noret_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
; The old value in %ret is intentionally dropped (no-return form).
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Same atomicrmw as @local_atomic_fadd_f64_noret_pat (fadd 4.0 on addrspace(3),
; seq_cst, !amdgpu.no.fine.grained.memory), but the function uses attribute
; group #0, which sets "denormal-fp-math"="preserve-sign,preserve-sign".
; The assertions still expect a single ds_add_f64 on all three run lines, i.e.
; the denormal mode does not change the selected instruction here.
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret void
|
|
}
|
|
|
|
; Non-returning `atomicrmw fadd` of 4.0 on an addrspace(3) pointer, seq_cst,
; with NO metadata on the atomicrmw; the function uses attribute group #4
; ("denormal-fp-math"="preserve-sign,preserve-sign").
; Even without !amdgpu.no.fine.grained.memory the assertions expect a single
; ds_add_f64 on all three run lines.
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_endpgm
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX942-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_endpgm
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning `atomicrmw fadd` of the constant 4.0 on an addrspace(3) pointer,
; seq_cst, tagged !amdgpu.no.fine.grained.memory; the function uses attribute
; group #1 and returns the atomicrmw result.
; The %data parameter is never read by the body; the addend is the literal 4.0.
; All three run lines expect ds_add_rtn_f64 writing v[0:1] with the constant
; materialized in v[2:3]. The gfx1250 assertions additionally expect
; s_wait_storecnt 0x0 right before the DS instruction.
define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data) #1 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_rtn_pat:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_rtn_pat:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
|
|
ret double %ret
|
|
}
|
|
|
|
; Returning f64 add on an addrspace(3) pointer through the
; @llvm.amdgcn.ds.fadd.f64 intrinsic, compiled with attribute group #2
; ("denormal-fp-math"="ieee,ieee"); the intrinsic result is returned.
; All three run lines expect ds_add_rtn_f64 writing v[0:1], with the data
; operand copied from v[1:2] into v[2:3] first.
; NOTE(review): attribute groups #2 and #3 are textually identical in this
; file, so nothing visible here distinguishes this "unsafe" variant from
; @local_atomic_fadd_f64_rtn_ieee_safe -- presumably a distinguishing
; attribute was removed at some point; confirm whether both tests are needed.
define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
|
|
ret double %ret
|
|
}
|
|
|
|
; Returning f64 add on an addrspace(3) pointer through the
; @llvm.amdgcn.ds.fadd.f64 intrinsic, compiled with attribute group #3
; ("denormal-fp-math"="ieee,ieee"); the intrinsic result is returned.
; All three run lines expect ds_add_rtn_f64 writing v[0:1], with the data
; operand copied from v[1:2] into v[2:3] first.
; NOTE(review): attribute groups #3 and #2 are textually identical in this
; file, so nothing visible here distinguishes this "safe" variant from
; @local_atomic_fadd_f64_rtn_ieee_unsafe -- confirm whether both tests are
; still needed.
define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
|
|
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
|
|
; GFX90A: ; %bb.0: ; %main_body
|
|
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX942-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
|
|
; GFX942: ; %bb.0: ; %main_body
|
|
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX942-NEXT: v_mov_b32_e32 v3, v2
|
|
; GFX942-NEXT: v_mov_b32_e32 v2, v1
|
|
; GFX942-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX942-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
|
|
; GFX1250: ; %bb.0: ; %main_body
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
|
|
; GFX1250-NEXT: s_wait_storecnt 0x0
|
|
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
|
|
; GFX1250-NEXT: s_wait_dscnt 0x0
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
main_body:
|
|
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
|
|
ret double %ret
|
|
}
|
|
|
|
; Function attribute groups referenced by the definitions in this file.
; #0 and #4 are textually identical, and so are #2 and #3.
; NOTE(review): presumably each pair once differed by an attribute that has
; since been dropped -- confirm before merging or renumbering them, since the
; functions are wired to the group numbers.
; #0: denormals flushed (preserve-sign) for both outputs and inputs.
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
|
|
; #1: nounwind only; no denormal mode override.
attributes #1 = { nounwind }
|
|
; #2: IEEE denormal handling; used by the *_ieee_unsafe test.
attributes #2 = { "denormal-fp-math"="ieee,ieee" }
|
|
; #3: IEEE denormal handling; used by the *_ieee_safe test.
attributes #3 = { "denormal-fp-math"="ieee,ieee" }
|
|
; #4: same text as #0; used by the *_flush_safe test.
attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
|
|
|
|
; !0: empty node, used as the operand of !amdgpu.no.fine.grained.memory.
!0 = !{}
|
|
; !1: operand of !noalias.addrspace on the flat atomics.
; NOTE(review): presumably encodes the half-open range [5, 6), i.e. the
; pointer is known not to be in address space 5 -- confirm against the LangRef.
!1 = !{i32 5, i32 6}
|