AMDGPU: Report unaligned scratch access as fast if supported by tgt (#158036)

This enables more consecutive load folding during
aggressive-instcombine.

The original motivating example provided by Jeff Byrnes:
https://godbolt.org/z/8ebcTEjTs

Example provided by Nikita Popov: https://godbolt.org/z/Gv1j4vjqE as
part of my original attempt to fix the issue (PR
[#133301](https://github.com/llvm/llvm-project/pull/133301), see his
[comment](https://github.com/llvm/llvm-project/pull/133301#issuecomment-2984905809)).

This changes the value of `IsFast` returned by `In
SITargetLowering::allowsMisalignedMemoryAccessesImpl` to be non-zero for
private and flat addresses if the subtarget supports unaligned scratch
accesses.

This enables aggressive-instcombine to do more folding of consecutive
loads (see
[here](cbd496581f/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp (L811))).

Summary performance impact on
[composable_kernel](https://github.com/ROCm/composable_kernel):

|GPU|speedup (geomean*)|
|---|---|
|MI300A| 1.11|
|MI300X| 1.14|
|MI350X| 1.03|

[*] Just to be clear, this is the geomean across kernels which were
impacted by this change - not across all CK kernels.
This commit is contained in:
macurtis-amd 2025-09-15 05:03:02 -05:00 committed by GitHub
parent d8c2607fb1
commit 2c091e6aec
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 594 additions and 440 deletions

View File

@ -2098,10 +2098,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
bool AlignedBy4 = Alignment >= Align(4);
if (Subtarget->hasUnalignedScratchAccessEnabled()) {
if (IsFast)
*IsFast = AlignedBy4 ? Size : 1;
return true;
}
if (IsFast)
*IsFast = AlignedBy4;
return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
return AlignedBy4;
}
// So long as they are correct, wide global memory operations perform better

View File

@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF-LABEL: memcpy_fixed_align:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; MUBUF-NEXT: s_lshr_b32 s4, s32, 6
; MUBUF-NEXT: s_waitcnt vmcnt(2)
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(3)
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(6)
; MUBUF-NEXT: s_waitcnt vmcnt(5)
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(8)
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use s4
; MUBUF-NEXT: ;;#ASMEND
@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s0
; FLATSCR-NEXT: ;;#ASMEND

View File

@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
; CHECK-NEXT: v_mov_b32_e32 v12, s1
; CHECK-NEXT: v_mov_b32_e32 v11, s0
; CHECK-NEXT: v_mov_b32_e32 v9, s3
; CHECK-NEXT: v_mov_b32_e32 v8, s2
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
; CHECK-NEXT: v_mov_b32_e32 v9, s1
; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(18)
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
; CHECK-NEXT: v_mov_b32_e32 v12, s1
; CHECK-NEXT: v_mov_b32_e32 v11, s0
; CHECK-NEXT: v_mov_b32_e32 v9, s3
; CHECK-NEXT: v_mov_b32_e32 v8, s2
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
; CHECK-NEXT: v_mov_b32_e32 v9, s1
; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(18)
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64

View File

@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
; CHECK-NEXT: ds_read_b128 v[3:6], v2
; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
; CHECK-NEXT: ds_read_b128 v[3:6], v2
; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(8)
; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(8)
; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)

View File

@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)

View File

@ -0,0 +1,234 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s
define i64 @quux(ptr %arg) {
; CHECK-LABEL: define i64 @quux(
; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1
; CHECK-NEXT: ret i64 [[LOAD]]
;
bb:
%load = load i8, ptr %arg, align 1
%getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1
%load1 = load i8, ptr %getelementptr, align 1
%getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2
%load3 = load i8, ptr %getelementptr2, align 1
%getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3
%load5 = load i8, ptr %getelementptr4, align 1
%getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4
%load7 = load i8, ptr %getelementptr6, align 1
%getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5
%load9 = load i8, ptr %getelementptr8, align 1
%getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6
%load11 = load i8, ptr %getelementptr10, align 1
%getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7
%load13 = load i8, ptr %getelementptr12, align 1
%zext = zext i8 %load13 to i64
%shl = shl nuw i64 %zext, 56
%zext14 = zext i8 %load11 to i64
%shl15 = shl nuw nsw i64 %zext14, 48
%or = or disjoint i64 %shl, %shl15
%zext16 = zext i8 %load9 to i64
%shl17 = shl nuw nsw i64 %zext16, 40
%or18 = or disjoint i64 %or, %shl17
%zext19 = zext i8 %load7 to i64
%shl20 = shl nuw nsw i64 %zext19, 32
%or21 = or disjoint i64 %or18, %shl20
%zext22 = zext i8 %load5 to i64
%shl23 = shl nuw nsw i64 %zext22, 24
%or24 = or disjoint i64 %or21, %shl23
%zext25 = zext i8 %load3 to i64
%shl26 = shl nuw nsw i64 %zext25, 16
%zext27 = zext i8 %load1 to i64
%shl28 = shl nuw nsw i64 %zext27, 8
%or29 = or disjoint i64 %or24, %shl26
%zext30 = zext i8 %load to i64
%or31 = or i64 %or29, %shl28
%or32 = or i64 %or31, %zext30
ret i64 %or32
}
; The following test case reduced from a client kernel
define fastcc <16 x float> @hoge(ptr %arg) {
; CHECK-LABEL: define fastcc <16 x float> @hoge(
; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
; CHECK-NEXT: [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
; CHECK-NEXT: [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16
; CHECK-NEXT: [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24
; CHECK-NEXT: [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
; CHECK-NEXT: [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
; CHECK-NEXT: [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0)
; CHECK-NEXT: [[CALL230:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
; CHECK-NEXT: [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <16 x float> [[CALL235]]
;
bb:
%load = load ptr, ptr %arg, align 8
%load28 = load i8, ptr %load, align 1
%getelementptr30 = getelementptr i8, ptr %load, i64 1
%load31 = load i8, ptr %getelementptr30, align 1
%getelementptr36 = getelementptr i8, ptr %load, i64 2
%load37 = load i8, ptr %getelementptr36, align 1
%getelementptr42 = getelementptr i8, ptr %load, i64 3
%load43 = load i8, ptr %getelementptr42, align 1
%getelementptr48 = getelementptr i8, ptr %load, i64 4
%load49 = load i8, ptr %getelementptr48, align 1
%getelementptr54 = getelementptr i8, ptr %load, i64 5
%load55 = load i8, ptr %getelementptr54, align 1
%getelementptr60 = getelementptr i8, ptr %load, i64 6
%load61 = load i8, ptr %getelementptr60, align 1
%getelementptr66 = getelementptr i8, ptr %load, i64 7
%load67 = load i8, ptr %getelementptr66, align 1
%getelementptr72 = getelementptr i8, ptr %load, i64 8
%load73 = load i8, ptr %getelementptr72, align 1
%getelementptr78 = getelementptr i8, ptr %load, i64 9
%load79 = load i8, ptr %getelementptr78, align 1
%getelementptr84 = getelementptr i8, ptr %load, i64 10
%load85 = load i8, ptr %getelementptr84, align 1
%getelementptr90 = getelementptr i8, ptr %load, i64 11
%load91 = load i8, ptr %getelementptr90, align 1
%getelementptr96 = getelementptr i8, ptr %load, i64 12
%load97 = load i8, ptr %getelementptr96, align 1
%getelementptr102 = getelementptr i8, ptr %load, i64 13
%load103 = load i8, ptr %getelementptr102, align 1
%getelementptr108 = getelementptr i8, ptr %load, i64 14
%load109 = load i8, ptr %getelementptr108, align 1
%getelementptr114 = getelementptr i8, ptr %load, i64 15
%load115 = load i8, ptr %getelementptr114, align 1
%getelementptr120 = getelementptr i8, ptr %load, i64 16
%load121 = load i8, ptr %getelementptr120, align 1
%getelementptr126 = getelementptr i8, ptr %load, i64 17
%load127 = load i8, ptr %getelementptr126, align 1
%getelementptr132 = getelementptr i8, ptr %load, i64 18
%load133 = load i8, ptr %getelementptr132, align 1
%getelementptr138 = getelementptr i8, ptr %load, i64 19
%load139 = load i8, ptr %getelementptr138, align 1
%getelementptr144 = getelementptr i8, ptr %load, i64 20
%load145 = load i8, ptr %getelementptr144, align 1
%getelementptr150 = getelementptr i8, ptr %load, i64 21
%load151 = load i8, ptr %getelementptr150, align 1
%getelementptr156 = getelementptr i8, ptr %load, i64 22
%load157 = load i8, ptr %getelementptr156, align 1
%getelementptr162 = getelementptr i8, ptr %load, i64 23
%load163 = load i8, ptr %getelementptr162, align 1
%getelementptr168 = getelementptr i8, ptr %load, i64 24
%load169 = load i8, ptr %getelementptr168, align 1
%getelementptr174 = getelementptr i8, ptr %load, i64 25
%load175 = load i8, ptr %getelementptr174, align 1
%getelementptr180 = getelementptr i8, ptr %load, i64 26
%load181 = load i8, ptr %getelementptr180, align 1
%getelementptr186 = getelementptr i8, ptr %load, i64 27
%load187 = load i8, ptr %getelementptr186, align 1
%getelementptr192 = getelementptr i8, ptr %load, i64 28
%load193 = load i8, ptr %getelementptr192, align 1
%getelementptr198 = getelementptr i8, ptr %load, i64 29
%load199 = load i8, ptr %getelementptr198, align 1
%getelementptr204 = getelementptr i8, ptr %load, i64 30
%load205 = load i8, ptr %getelementptr204, align 1
%getelementptr210 = getelementptr i8, ptr %load, i64 31
%load211 = load i8, ptr %getelementptr210, align 1
%alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64
%alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56
%alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64
%alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48
%alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift
%alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64
%alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40
%alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift
%alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64
%alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32
%alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift
%alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64
%alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24
%alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift
%alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64
%alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16
%alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64
%alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8
%alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift
%alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64
%alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift
%alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext
%call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
%alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64
%alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56
%alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64
%alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48
%alloca1.sroa.16.8.insert.insert = or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift
%alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64
%alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40
%alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift
%alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64
%alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32
%alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift
%alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64
%alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24
%alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift
%alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64
%alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16
%alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64
%alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8
%alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift
%alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64
%alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift
%alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext
%call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0)
%alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64
%alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56
%alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64
%alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48
%alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift
%alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64
%alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40
%alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift
%alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64
%alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32
%alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift
%alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64
%alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24
%alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift
%alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64
%alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16
%alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64
%alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8
%alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift
%alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64
%alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift
%alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext
%call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0)
%alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64
%alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56
%alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64
%alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48
%alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift
%alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64
%alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40
%alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift
%alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64
%alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32
%alloca1.sroa.32.24.insert.insert = or i64 %alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift
%alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64
%alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24
%alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift
%alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64
%alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16
%alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64
%alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8
%alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift
%alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64
%alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift
%alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext
%call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0)
ret <16 x float> %call235
}
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }

View File

@ -0,0 +1,2 @@
if not "AMDGPU" in config.root.targets:
config.unsupported = True