
Recent upstream trends have moved away from explicitly using `-verify-machineinstrs`, as it is already covered by the expensive checks. This PR removes almost all `-verify-machineinstrs` uses from tests in `llvm/test/CodeGen/AMDGPU/*.ll`, leaving only those tests where its removal currently causes failures.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; LDS aggregate used by the memcpy-based test below: an array of pointers
; followed by a 16-byte tail that the memcpy writes into.
%struct.lds = type { [64 x ptr], [16 x i8] }

@stored_lds_struct = addrspace(3) global %struct.lds poison, align 16
; LDS slot holding a pointer back into LDS (addrspace(3)).
@stored_lds_ptr = addrspace(3) global ptr addrspace(3) poison, align 4
; LDS slot holding a pointer into constant memory (addrspace(4)).
@stored_constant_ptr = addrspace(3) global ptr addrspace(4) poison, align 8
; LDS slot holding a pointer into global memory (addrspace(1)).
@stored_global_ptr = addrspace(3) global ptr addrspace(1) poison, align 8
; The memcpy source is a flat pointer, which may alias LDS, so the
; ds_read of the copied bytes must stay after the flat_load + ds_write
; expansion (checks show ds_write_b128 before ds_read2_b32).
define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
; CI-LABEL: no_reorder_flat_load_local_store_local_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_load_dword s0, s[4:5], 0x9
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: ds_write_b128 v4, v[0:3] offset:512
; CI-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: no_reorder_flat_load_local_store_local_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_write_b128 v4, v[0:3] offset:512
; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:129 offset1:130
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: s_endpgm
  %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1
  %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4
  call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false)
  %vector_load = load <2 x i32>, ptr addrspace(3) %ptr2, align 4
  store <2 x i32> %vector_load, ptr addrspace(3) %out, align 4
  ret void
}
; A global store cannot alias LDS, so the two local loads may be moved
; together across it (checks show them combined into one ds_read2_b32
; issued before the store of 99).
define amdgpu_kernel void @reorder_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: reorder_local_load_global_store_local_load:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v0, v0
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: v_mov_b32_e32 v2, 0x63
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:3
; CI-NEXT: s_mov_b32 s4, s0
; CI-NEXT: s_mov_b32 s5, s1
; CI-NEXT: s_mov_b32 s0, s2
; CI-NEXT: s_mov_b32 s1, s3
; CI-NEXT: s_mov_b32 s2, s6
; CI-NEXT: s_mov_b32 s3, s7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_local_load_global_store_local_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b32 v0, v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0x63
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset0:1 offset1:3
; GFX9-NEXT: global_store_dword v2, v3, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Same shape as above, but the global store is volatile: the second
; local load must not be hoisted across it (checks keep the second
; ds_read_b32 after the store and its vmcnt(0) wait).
define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: no_reorder_local_load_volatile_global_store_local_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v0, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s10, s6
; CI-NEXT: s_mov_b32 s11, s7
; CI-NEXT: v_mov_b32_e32 v2, 0x63
; CI-NEXT: ds_read_b32 v1, v0 offset:4
; CI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_read_b32 v0, v0 offset:12
; CI-NEXT: s_mov_b32 s4, s0
; CI-NEXT: s_mov_b32 s5, s1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: no_reorder_local_load_volatile_global_store_local_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0x63
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_read_b32 v3, v1 offset:4
; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_read_b32 v1, v1 offset:12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store volatile i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; An s_barrier between the two local loads is a scheduling fence: the
; second ds_read must stay after the barrier (checks keep the order
; store -> s_barrier -> ds_read offset:12).
define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: no_reorder_barrier_local_load_global_store_local_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v0, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s10, s6
; CI-NEXT: s_mov_b32 s11, s7
; CI-NEXT: v_mov_b32_e32 v2, 0x63
; CI-NEXT: ds_read_b32 v1, v0 offset:4
; CI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: s_barrier
; CI-NEXT: ds_read_b32 v0, v0 offset:12
; CI-NEXT: s_mov_b32 s4, s0
; CI-NEXT: s_mov_b32 s5, s1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: no_reorder_barrier_local_load_global_store_local_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: ds_read_b32 v1, v0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0x63
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_read_b32 v3, v1 offset:4
; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_barrier
; GFX9-NEXT: ds_read_b32 v1, v1 offset:12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  call void @llvm.amdgcn.s.barrier() #1
  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Constant-address-space loads (selected as s_load) cannot be clobbered
; by the global store, so they may be scheduled around it; the first
; s_load_dword is issued before the buffer/global store of 99.
define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
; CI-LABEL: reorder_constant_load_global_store_constant_load:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: ds_read_b64 v[0:1], v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_mov_b32 s10, s6
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s11, s7
; CI-NEXT: v_readfirstlane_b32 s2, v0
; CI-NEXT: v_readfirstlane_b32 s3, v1
; CI-NEXT: v_mov_b32_e32 v0, 0x63
; CI-NEXT: s_load_dword s12, s[2:3], 0x1
; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT: s_load_dword s2, s[2:3], 0x3
; CI-NEXT: s_mov_b32 s4, s0
; CI-NEXT: s_mov_b32 s5, s1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_i32 s0, s12, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_constant_load_global_store_constant_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0x63
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4
; GFX9-NEXT: global_store_dword v2, v3, s[2:3]
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8

  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(1) %gptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Constant loads cannot alias the LDS store either: both s_load_dword
; reads of the constant pointer are issued before the ds_write_b32 of 99.
define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr) #0 {
; CI-LABEL: reorder_constant_load_local_store_constant_load:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b64 v[0:1], v0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT: s_load_dword s6, s[4:5], 0xb
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_readfirstlane_b32 s4, v0
; CI-NEXT: v_readfirstlane_b32 s5, v1
; CI-NEXT: s_load_dword s7, s[4:5], 0x1
; CI-NEXT: s_load_dword s4, s[4:5], 0x3
; CI-NEXT: v_mov_b32_e32 v0, 0x63
; CI-NEXT: v_mov_b32_e32 v1, s6
; CI-NEXT: ds_write_b32 v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_i32 s4, s7, s4
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_constant_load_local_store_constant_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[0:1], v2
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x4
; GFX9-NEXT: s_load_dword s8, s[0:1], 0xc
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x63
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s7, s8
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8

  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Two adjacent SMRD loads from a kernel-argument constant pointer are
; merged into a single s_load_dwordx2 and scheduled across the
; (noalias) LDS store.
define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 {
; CI-LABEL: reorder_smrd_load_local_store_smrd_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT: s_load_dword s8, s[4:5], 0xb
; CI-NEXT: v_mov_b32_e32 v0, 0x63
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: ds_write_b32 v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_i32 s4, s4, s5
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_smrd_load_local_store_smrd_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0x63
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: ds_write_b32 v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s4, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2

  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Global loads and an LDS store cannot alias, so both global loads are
; clustered and issued before the ds_write_b32 of 99.
define amdgpu_kernel void @reorder_global_load_local_store_global_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr, ptr addrspace(1) %ptr0) #0 {
; CI-LABEL: reorder_global_load_local_store_global_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_mov_b32 s2, s6
; CI-NEXT: s_mov_b32 s3, s7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
; CI-NEXT: s_load_dword s0, s[4:5], 0xb
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; CI-NEXT: v_mov_b32_e32 v2, 0x63
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write_b32 v3, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_global_load_local_store_global_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x63
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:12
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: ds_write_b32 v4, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 3

  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 99, ptr addrspace(3) %lptr, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2

  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; LDS accesses at provably distinct offsets (3, 100, 102 dwords) do not
; alias, so the loads are combined into one ds_read2_b32 scheduled ahead
; of the stores, and the stores into a ds_write2_b32 + ds_write_b32.
define amdgpu_kernel void @reorder_local_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(3) noalias nocapture %ptr0) #0 {
; CI-LABEL: reorder_local_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s6, s[4:5], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s6
; CI-NEXT: ds_read2_b32 v[0:1], v3 offset0:100 offset1:102
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: ds_write2_b32 v3, v2, v2 offset0:3 offset1:100
; CI-NEXT: v_mov_b32_e32 v2, 0x315
; CI-NEXT: ds_write_b32 v3, v2 offset:408
; CI-NEXT: s_waitcnt lgkmcnt(2)
; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_local_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: ds_read2_b32 v[0:1], v3 offset0:100 offset1:102
; GFX9-NEXT: ds_write2_b32 v3, v4, v4 offset0:3 offset1:100
; GFX9-NEXT: v_mov_b32_e32 v4, 0x315
; GFX9-NEXT: ds_write_b32 v3, v4 offset:408
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 100
  %ptr3 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 102

  store i32 123, ptr addrspace(3) %ptr1, align 4
  %tmp1 = load i32, ptr addrspace(3) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(3) %ptr3, align 4
  store i32 123, ptr addrspace(3) %ptr2, align 4
  %tmp3 = load i32, ptr addrspace(3) %ptr1, align 4
  store i32 789, ptr addrspace(3) %ptr3, align 4

  %add.0 = add nsw i32 %tmp2, %tmp1
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %out, align 4
  ret void
}
; Same access pattern as reorder_local_offsets but in global memory:
; the two loads (offsets 400/408) are clustered ahead of the three
; non-aliasing stores (offsets 12/400/408).
define amdgpu_kernel void @reorder_global_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(1) noalias nocapture %ptr0) #0 {
; CI-LABEL: reorder_global_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_mov_b32 s2, s6
; CI-NEXT: s_mov_b32 s3, s7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:400
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:408
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: v_mov_b32_e32 v3, 0x315
; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12
; CI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:400
; CI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:408
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CI-NEXT: v_add_i32_e32 v0, vcc, 0x7b, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_global_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:400
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:408
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:12
; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:400
; GFX9-NEXT: v_mov_b32_e32 v3, 0x315
; GFX9-NEXT: global_store_dword v0, v3, s[0:1] offset:408
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_add_u32_e32 v1, 0x7b, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 100
  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 102

  store i32 123, ptr addrspace(1) %ptr1, align 4
  %tmp1 = load i32, ptr addrspace(1) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
  store i32 123, ptr addrspace(1) %ptr2, align 4
  %tmp3 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 789, ptr addrspace(1) %ptr3, align 4

  %add.0 = add nsw i32 %tmp2, %tmp1
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %out, align 4
  ret void
}
; Variant with a per-lane (workitem.id.x-indexed) base: on CI this
; selects addr64 buffer ops with soffset 0; the three loads (offsets
; 12/28/44) are still clustered ahead of the non-aliasing stores.
define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(ptr addrspace(1) noalias nocapture %ptr.base) #0 {
; CI-LABEL: reorder_global_offsets_addr64_soffset0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:12
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:28
; CI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:44
; CI-NEXT: v_mov_b32_e32 v5, 0x315
; CI-NEXT: v_mov_b32_e32 v6, 0x7b
; CI-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 offset:20
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:36
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:52
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_global_offsets_addr64_soffset0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x315
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:12
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:28
; GFX9-NEXT: global_load_dword v3, v0, s[0:1] offset:44
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_store_dword v0, v4, s[0:1]
; GFX9-NEXT: global_store_dword v0, v5, s[0:1] offset:20
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:36
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:52
; GFX9-NEXT: s_endpgm
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %id.ext = sext i32 %id to i64

  %ptr0 = getelementptr inbounds i32, ptr addrspace(1) %ptr.base, i64 %id.ext
  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 5
  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 7
  %ptr4 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 9
  %ptr5 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 11
  %ptr6 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 13

  store i32 789, ptr addrspace(1) %ptr0, align 4
  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
  store i32 123, ptr addrspace(1) %ptr2, align 4
  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
  %add.0 = add nsw i32 %tmp1, %tmp2
  store i32 %add.0, ptr addrspace(1) %ptr4, align 4
  %tmp3 = load i32, ptr addrspace(1) %ptr5, align 4
  %add.1 = add nsw i32 %add.0, %tmp3
  store i32 %add.1, ptr addrspace(1) %ptr6, align 4
  ret void
}
; amdgpu_vs (shader) entry: the tbuffer store cannot alias LDS, so the
; two local loads are combined into a ds_read2_b32 issued before the
; tbuffer_store_format_xyzw.
define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(ptr addrspace(1) %out, i32 %a1, i32 %vaddr) #0 {
; CI-LABEL: reorder_local_load_tbuffer_store_local_load:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read_b32 v4, v4
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s0, s2
; CI-NEXT: s_mov_b32 s1, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset0:1 offset1:2
; CI-NEXT: v_add_i32_e32 v3, vcc, 32, v3
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_SNORM_OGL] idxen glc slc
; CI-NEXT: s_nop 0
; CI-NEXT: v_add_i32_e32 v2, vcc, v4, v5
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: reorder_local_load_tbuffer_store_local_load:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, stored_lds_ptr@abs32@lo
; GFX9-NEXT: ds_read_b32 v4, v4
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset0:1 offset1:2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: tbuffer_store_format_xyzw v[2:5], v3, s[0:3], 0 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_RESERVED_6] idxen glc slc
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_add_u32_e32 v2, v4, v5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4

  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 2

  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4

  %vdata = insertelement <4 x i32> poison, i32 %a1, i32 0
  %vaddr.add = add i32 %vaddr, 32
  call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4i32(<4 x i32> %vdata, ptr addrspace(8) poison, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3)

  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4

  %add = add nsw i32 %tmp1, %tmp2
  store i32 %add, ptr addrspace(1) %out, align 4
  ret void
}
; Intrinsic declarations used by the tests above.
declare void @llvm.memcpy.p3.p0(ptr addrspace(3), ptr, i64, i1)
declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4i32(<4 x i32>, ptr addrspace(8), i32, i32, i32, i32 immarg, i32 immarg) #3

; #0: kernels; #1: barrier (convergent); #2: readnone id query;
; #3: writeonly tbuffer store.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind willreturn }
attributes #2 = { nounwind readnone speculatable willreturn }
attributes #3 = { nounwind willreturn writeonly }