Use the default, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern would be if introducing wider registers is more likely to push the register usage up to the next occupancy tier.
1039 lines
48 KiB
LLVM
1039 lines
48 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=SPLIT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefix=SPLIT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefix=SPLIT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefix=ALIGNED-GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefix=UNALIGNED-GFX11 %s

; Swaps the two elements of a <2 x i32> held in address space 3, using a
; load and a store that are only 4-byte aligned. The accessed address is
; %arg + 4 * workitem.id.x.
;
; Every configuration below expects the 64-bit access to be issued as a
; two-dword DS operation (ds_read2_b32 / ds_write2_b32, spelled
; ds_load_2addr_b32 / ds_store_2addr_b32 on GFX11), and the element swap to be
; folded into the operand order of the write.
define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_misaligned_v2:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; SPLIT-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: ds_write2_b32 v2, v1, v0 offset1:1
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_local_misaligned_v2:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: ds_write2_b32 v2, v1, v0 offset1:1
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_local_misaligned_v2:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: ds_write2_b32 v2, v1, v0 offset1:1
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_local_misaligned_v2:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v1, v0 offset1:1
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_local_misaligned_v2:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v1, v0 offset1:1
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
  ret void
}

; Reverses the four elements of a <4 x i32> held in address space 3, using a
; load and a store that are only 4-byte aligned. The accessed address is
; %arg + 4 * workitem.id.x.
;
; Every configuration below expects the 128-bit access to be issued as two
; two-dword DS reads and two two-dword DS writes (ds_read2_b32 / ds_write2_b32,
; spelled ds_load_2addr_b32 / ds_store_2addr_b32 on GFX11). The configurations
; differ only in which half of the vector is read first; in each case the
; reversal is folded into the write offsets and operand order.
define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_misaligned_v4:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0
; SPLIT-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:3
; SPLIT-NEXT: ds_read2_b32 v[2:3], v4 offset1:1
; SPLIT-NEXT: s_waitcnt lgkmcnt(1)
; SPLIT-NEXT: ds_write2_b32 v4, v1, v0 offset1:1
; SPLIT-NEXT: s_waitcnt lgkmcnt(1)
; SPLIT-NEXT: ds_write2_b32 v4, v3, v2 offset0:2 offset1:3
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_local_misaligned_v4:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0
; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v4 offset1:1
; ALIGNED-GFX10-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GFX10-NEXT: ds_write2_b32 v4, v1, v0 offset0:2 offset1:3
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GFX10-NEXT: ds_write2_b32 v4, v3, v2 offset1:1
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_local_misaligned_v4:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0
; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:3
; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[2:3], v4 offset1:1
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-GFX10-NEXT: ds_write2_b32 v4, v1, v0 offset1:1
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-GFX10-NEXT: ds_write2_b32 v4, v3, v2 offset0:2 offset1:3
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_local_misaligned_v4:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0
; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v4 offset1:1
; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[2:3], v4 offset0:2 offset1:3
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v1, v0 offset0:2 offset1:3
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v3, v2 offset1:1
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_local_misaligned_v4:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0
; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v4 offset0:2 offset1:3
; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[2:3], v4 offset1:1
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v1, v0 offset1:1
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v4, v3, v2 offset0:2 offset1:3
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
  ret void
}

; Rotates the three elements of a <3 x i32> held in address space 3 so that
; the stored vector is [e2, e0, e1], using a load and a store that are only
; 4-byte aligned. The accessed address is %arg + 4 * workitem.id.x.
;
; Every configuration below expects the 96-bit access to be issued as one
; two-dword DS operation plus one single-dword DS operation at byte offset 8
; (ds_read2_b32 + ds_read_b32 and ds_write2_b32 + ds_write_b32, spelled
; ds_load_* / ds_store_* on GFX11).
define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_misaligned_v3:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; SPLIT-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; SPLIT-NEXT: ds_read_b32 v3, v2 offset:8
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: ds_write2_b32 v2, v3, v0 offset1:1
; SPLIT-NEXT: ds_write_b32 v2, v1 offset:8
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_local_misaligned_v3:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; ALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GFX10-NEXT: ds_read_b32 v3, v2 offset:8
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: ds_write2_b32 v2, v3, v0 offset1:1
; ALIGNED-GFX10-NEXT: ds_write_b32 v2, v1 offset:8
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_local_misaligned_v3:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; UNALIGNED-GFX10-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; UNALIGNED-GFX10-NEXT: ds_read_b32 v3, v2 offset:8
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: ds_write2_b32 v2, v3, v0 offset1:1
; UNALIGNED-GFX10-NEXT: ds_write_b32 v2, v1 offset:8
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_local_misaligned_v3:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; ALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; ALIGNED-GFX11-NEXT: ds_load_b32 v3, v2 offset:8
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v3, v0 offset1:1
; ALIGNED-GFX11-NEXT: ds_store_b32 v2, v1 offset:8
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_local_misaligned_v3:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v2, v0, 2, s0
; UNALIGNED-GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; UNALIGNED-GFX11-NEXT: ds_load_b32 v3, v2 offset:8
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: ds_store_2addr_b32 v2, v3, v0 offset1:1
; UNALIGNED-GFX11-NEXT: ds_store_b32 v2, v1 offset:8
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
  ret void
}

; Swaps the two elements of a <2 x i32> accessed through a plain `ptr`
; (flat instructions in the expected output), using a load and a store that
; are only 4-byte aligned. The accessed address is %arg + 4 * workitem.id.x.
;
; Expected lowering differs by configuration:
;  * the SPLIT runs expect two flat_load_dword / flat_store_dword pairs, with
;    a second address computed at byte offset 4;
;  * all other runs expect a single flat_load_dwordx2 / flat_store_dwordx2
;    (flat_load_b64 / flat_store_b64 on GFX11), with one v_mov to build the
;    swapped register pair.
define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
; SPLIT-LABEL: test_flat_misaligned_v2:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0
; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4
; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: s_clause 0x1
; SPLIT-NEXT: flat_load_dword v4, v[2:3]
; SPLIT-NEXT: flat_load_dword v5, v[0:1]
; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; SPLIT-NEXT: flat_store_dword v[0:1], v4
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; SPLIT-NEXT: flat_store_dword v[2:3], v5
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_flat_misaligned_v2:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
  %load = load <2 x i32>, ptr %gep, align 4
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, ptr %gep, align 4
  ret void
}

; Reverses the four elements of a <4 x i32> accessed through a plain `ptr`
; (flat instructions in the expected output), using a load and a store that
; are only 4-byte aligned. The accessed address is %arg + 4 * workitem.id.x.
;
; Expected lowering differs by configuration:
;  * the SPLIT runs expect four flat_load_dword and four flat_store_dword
;    instructions, with separate addresses computed at byte offsets 4, 8
;    and 12;
;  * all other runs expect a single flat_load_dwordx4 / flat_store_dwordx4
;    (flat_load_b128 / flat_store_b128 on GFX11), with v_mov copies building
;    the reversed register tuple.
define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
; SPLIT-LABEL: test_flat_misaligned_v4:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0
; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 12
; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: v_add_co_u32 v4, vcc_lo, v0, 4
; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
; SPLIT-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: s_clause 0x3
; SPLIT-NEXT: flat_load_dword v8, v[2:3]
; SPLIT-NEXT: flat_load_dword v9, v[4:5]
; SPLIT-NEXT: flat_load_dword v10, v[0:1]
; SPLIT-NEXT: flat_load_dword v11, v[6:7]
; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; SPLIT-NEXT: flat_store_dword v[6:7], v9
; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
; SPLIT-NEXT: flat_store_dword v[2:3], v10
; SPLIT-NEXT: flat_store_dword v[0:1], v8
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; SPLIT-NEXT: flat_store_dword v[4:5], v11
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_flat_misaligned_v4:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v4:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_flat_misaligned_v4:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v4:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
  %load = load <4 x i32>, ptr %gep, align 4
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, ptr %gep, align 4
  ret void
}

; Rotates the three elements of a <3 x i32> accessed through a plain `ptr`
; (flat instructions in the expected output) so that the stored vector is
; [e2, e0, e1], using a load and a store that are only 4-byte aligned. The
; accessed address is %arg + 4 * workitem.id.x.
;
; Expected lowering differs by configuration:
;  * the SPLIT runs expect three flat_load_dword and three flat_store_dword
;    instructions, with separate addresses computed at byte offsets 4 and 8;
;  * all other runs expect a single flat_load_dwordx3 / flat_store_dwordx3
;    (flat_load_b96 / flat_store_b96 on GFX11), with v_mov copies building
;    the rotated register tuple.
define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) {
; SPLIT-LABEL: test_flat_misaligned_v3:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0
; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 4
; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: v_add_co_u32 v4, vcc_lo, v0, 8
; SPLIT-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; SPLIT-NEXT: s_clause 0x2
; SPLIT-NEXT: flat_load_dword v6, v[2:3]
; SPLIT-NEXT: flat_load_dword v7, v[4:5]
; SPLIT-NEXT: flat_load_dword v8, v[0:1]
; SPLIT-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; SPLIT-NEXT: flat_store_dword v[4:5], v6
; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
; SPLIT-NEXT: flat_store_dword v[0:1], v7
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; SPLIT-NEXT: flat_store_dword v[2:3], v8
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_flat_misaligned_v3:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0
; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6]
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4]
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0
; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6]
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4]
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0
; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6]
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4]
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0
; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6]
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4]
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
  %load = load <3 x i32>, ptr %gep, align 4
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, ptr %gep, align 4
  ret void
}

; Swaps the two elements of a <2 x i32> held in address space 3, using a
; load and a store that are 8-byte aligned. The accessed address is
; %arg + 4 * workitem.id.x.
;
; Every configuration below expects one 64-bit DS read and one 64-bit DS
; write (ds_read_b64 / ds_write_b64, spelled ds_load_b64 / ds_store_b64 on
; GFX11), with a single v_mov building the swapped register pair.
define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_aligned_v2:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_lshl_add_u32 v3, v0, 2, s0
; SPLIT-NEXT: ds_read_b64 v[0:1], v3
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_mov_b32_e32 v2, v0
; SPLIT-NEXT: ds_write_b64 v3, v[1:2]
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_local_aligned_v2:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v3, v0, 2, s0
; ALIGNED-GFX10-NEXT: ds_read_b64 v[0:1], v3
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
; ALIGNED-GFX10-NEXT: ds_write_b64 v3, v[1:2]
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_local_aligned_v2:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v3, v0, 2, s0
; UNALIGNED-GFX10-NEXT: ds_read_b64 v[0:1], v3
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
; UNALIGNED-GFX10-NEXT: ds_write_b64 v3, v[1:2]
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_local_aligned_v2:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v3, v0, 2, s0
; ALIGNED-GFX11-NEXT: ds_load_b64 v[0:1], v3
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
; ALIGNED-GFX11-NEXT: ds_store_b64 v3, v[1:2]
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_local_aligned_v2:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v3, v0, 2, s0
; UNALIGNED-GFX11-NEXT: ds_load_b64 v[0:1], v3
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
; UNALIGNED-GFX11-NEXT: ds_store_b64 v3, v[1:2]
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <2 x i32>, ptr addrspace(3) %gep, align 8
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, ptr addrspace(3) %gep, align 8
  ret void
}

; Rotates the three elements of a <3 x i32> held in address space 3 so that
; the stored vector is [e2, e0, e1], using a load and a store that are
; 16-byte aligned. The accessed address is %arg + 4 * workitem.id.x.
;
; Every configuration below expects one 96-bit DS read and one 96-bit DS
; write (ds_read_b96 / ds_write_b96, spelled ds_load_b96 / ds_store_b96 on
; GFX11), with v_mov copies building the rotated register tuple.
define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
; SPLIT-LABEL: test_local_aligned_v3:
; SPLIT: ; %bb.0: ; %bb
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0
; SPLIT-NEXT: ds_read_b96 v[0:2], v5
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
; SPLIT-NEXT: v_mov_b32_e32 v3, v0
; SPLIT-NEXT: v_mov_b32_e32 v4, v1
; SPLIT-NEXT: ds_write_b96 v5, v[2:4]
; SPLIT-NEXT: s_endpgm
;
; ALIGNED-GFX10-LABEL: test_local_aligned_v3:
; ALIGNED-GFX10: ; %bb.0: ; %bb
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4]
; ALIGNED-GFX10-NEXT: s_endpgm
;
; UNALIGNED-GFX10-LABEL: test_local_aligned_v3:
; UNALIGNED-GFX10: ; %bb.0: ; %bb
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4]
; UNALIGNED-GFX10-NEXT: s_endpgm
;
; ALIGNED-GFX11-LABEL: test_local_aligned_v3:
; ALIGNED-GFX11: ; %bb.0: ; %bb
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4]
; ALIGNED-GFX11-NEXT: s_endpgm
;
; UNALIGNED-GFX11-LABEL: test_local_aligned_v3:
; UNALIGNED-GFX11: ; %bb.0: ; %bb
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4]
; UNALIGNED-GFX11-NEXT: s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  %load = load <3 x i32>, ptr addrspace(3) %gep, align 16
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, ptr addrspace(3) %gep, align 16
  ret void
}

; test_flat_aligned_v2 - each workitem loads a <2 x i32> through a plain `ptr`
; at %arg + workitem.id.x i32 elements, swaps the two elements, and stores the
; result back to the same address. Both memory operations are marked align 8.
; Every check prefix below expects one 64-bit flat load and one 64-bit flat
; store (flat_load_dwordx2 / flat_store_dwordx2 on the GFX10 prefixes,
; flat_load_b64 / flat_store_b64 on the GFX11 prefixes).
define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
|
|
; SPLIT-LABEL: test_flat_aligned_v2:
|
|
; SPLIT: ; %bb.0: ; %bb
|
|
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0
|
|
; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
|
|
; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
|
|
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SPLIT-NEXT: v_mov_b32_e32 v2, v0
|
|
; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
|
|
; SPLIT-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX10-LABEL: test_flat_aligned_v2:
|
|
; ALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
|
|
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
|
|
; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
|
|
; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
|
|
; ALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2:
|
|
; UNALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
|
|
; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
|
|
; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
|
|
; UNALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX11-LABEL: test_flat_aligned_v2:
|
|
; ALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
|
|
; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
|
|
; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
|
|
; ALIGNED-GFX11-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2:
|
|
; UNALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
|
|
; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
|
|
; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
|
|
; UNALIGNED-GFX11-NEXT: s_endpgm
|
|
bb:
|
|
; Per-workitem address: %arg advanced by workitem.id.x i32 elements.
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr %arg, i32 %lid
|
|
%load = load <2 x i32>, ptr %gep, align 8
|
|
%v1 = extractelement <2 x i32> %load, i32 0
|
|
%v2 = extractelement <2 x i32> %load, i32 1
|
|
; Rebuild the vector with the two elements swapped, then store it back.
%v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
|
|
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
|
|
store <2 x i32> %v4, ptr %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; test_flat_aligned_v4 - each workitem loads a <4 x i32> through a plain `ptr`
; at %arg + workitem.id.x i32 elements, reverses the element order, and stores
; the result back to the same address. Both memory operations are marked
; align 16.
; Every check prefix below expects one 128-bit flat load and one 128-bit flat
; store (flat_load_dwordx4 / flat_store_dwordx4 on the GFX10 prefixes,
; flat_load_b128 / flat_store_b128 on the GFX11 prefixes).
define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) {
|
|
; SPLIT-LABEL: test_flat_aligned_v4:
|
|
; SPLIT: ; %bb.0: ; %bb
|
|
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SPLIT-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; SPLIT-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
|
|
; SPLIT-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
|
|
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SPLIT-NEXT: v_mov_b32_e32 v4, v2
|
|
; SPLIT-NEXT: v_mov_b32_e32 v5, v1
|
|
; SPLIT-NEXT: v_mov_b32_e32 v6, v0
|
|
; SPLIT-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
|
|
; SPLIT-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX10-LABEL: test_flat_aligned_v4:
|
|
; ALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
|
|
; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
|
|
; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
|
|
; ALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX10-LABEL: test_flat_aligned_v4:
|
|
; UNALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
|
|
; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
|
|
; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
|
|
; UNALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX11-LABEL: test_flat_aligned_v4:
|
|
; ALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
|
|
; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
|
|
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
|
|
; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
|
|
; ALIGNED-GFX11-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX11-LABEL: test_flat_aligned_v4:
|
|
; UNALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
|
|
; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
|
|
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
|
|
; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
|
|
; UNALIGNED-GFX11-NEXT: s_endpgm
|
|
bb:
|
|
; Per-workitem address: %arg advanced by workitem.id.x i32 elements.
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr %arg, i32 %lid
|
|
%load = load <4 x i32>, ptr %gep, align 16
|
|
%v1 = extractelement <4 x i32> %load, i32 0
|
|
%v2 = extractelement <4 x i32> %load, i32 1
|
|
%v3 = extractelement <4 x i32> %load, i32 2
|
|
%v4 = extractelement <4 x i32> %load, i32 3
|
|
; Rebuild the vector in reversed element order (3, 2, 1, 0), then store it back.
%v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
|
|
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
|
|
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
|
|
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
|
|
store <4 x i32> %v8, ptr %gep, align 16
|
|
ret void
|
|
}
|
|
|
|
; test_local_v4_aligned8 - each workitem loads a <4 x i32> through an
; addrspace(3) pointer at %arg + workitem.id.x i32 elements, reverses the
; element order, and stores the result back to the same address. Both memory
; operations are marked align 8 (less than the 16-byte vector size).
; Every check prefix below expects the access to be done as a pair of 64-bit
; DS operations with offset1:1 (ds_read2_b64 / ds_write2_b64 on the GFX10
; prefixes, ds_load_2addr_b64 / ds_store_2addr_b64 on the GFX11 prefixes);
; the first prefix differs from the others only in the registers chosen.
define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
|
|
; SPLIT-LABEL: test_local_v4_aligned8:
|
|
; SPLIT: ; %bb.0: ; %bb
|
|
; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SPLIT-NEXT: v_lshl_add_u32 v6, v0, 2, s0
|
|
; SPLIT-NEXT: ds_read2_b64 v[1:4], v6 offset1:1
|
|
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SPLIT-NEXT: v_mov_b32_e32 v0, v2
|
|
; SPLIT-NEXT: v_mov_b32_e32 v5, v3
|
|
; SPLIT-NEXT: ds_write2_b64 v6, v[4:5], v[0:1] offset1:1
|
|
; SPLIT-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX10-LABEL: test_local_v4_aligned8:
|
|
; ALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
|
|
; ALIGNED-GFX10-NEXT: ds_read2_b64 v[0:3], v5 offset1:1
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
|
|
; ALIGNED-GFX10-NEXT: ds_write2_b64 v5, v[3:4], v[1:2] offset1:1
|
|
; ALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX10-LABEL: test_local_v4_aligned8:
|
|
; UNALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
|
|
; UNALIGNED-GFX10-NEXT: ds_read2_b64 v[0:3], v5 offset1:1
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
|
|
; UNALIGNED-GFX10-NEXT: ds_write2_b64 v5, v[3:4], v[1:2] offset1:1
|
|
; UNALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX11-LABEL: test_local_v4_aligned8:
|
|
; ALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
|
|
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
|
|
; ALIGNED-GFX11-NEXT: ds_load_2addr_b64 v[0:3], v5 offset1:1
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2
|
|
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
|
|
; ALIGNED-GFX11-NEXT: ds_store_2addr_b64 v5, v[3:4], v[1:2] offset1:1
|
|
; ALIGNED-GFX11-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX11-LABEL: test_local_v4_aligned8:
|
|
; UNALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
|
|
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
|
|
; UNALIGNED-GFX11-NEXT: ds_load_2addr_b64 v[0:3], v5 offset1:1
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2
|
|
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
|
|
; UNALIGNED-GFX11-NEXT: ds_store_2addr_b64 v5, v[3:4], v[1:2] offset1:1
|
|
; UNALIGNED-GFX11-NEXT: s_endpgm
|
|
bb:
|
|
; Per-workitem address: %arg advanced by workitem.id.x i32 elements.
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
|
|
%load = load <4 x i32>, ptr addrspace(3) %gep, align 8
|
|
%v1 = extractelement <4 x i32> %load, i32 0
|
|
%v2 = extractelement <4 x i32> %load, i32 1
|
|
%v3 = extractelement <4 x i32> %load, i32 2
|
|
%v4 = extractelement <4 x i32> %load, i32 3
|
|
; Rebuild the vector in reversed element order (3, 2, 1, 0), then store it back.
%v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
|
|
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
|
|
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
|
|
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
|
|
store <4 x i32> %v8, ptr addrspace(3) %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; test_flat_v4_aligned8 - each workitem loads a <4 x i32> through a plain `ptr`
; at %arg + workitem.id.x i32 elements, reverses the element order, and stores
; the result back to the same address. Both memory operations are marked
; align 8 (less than the 16-byte vector size).
; The expectations below differ by check prefix:
;  - the first prefix (SPLIT) expects the access to be broken into two 64-bit
;    flat loads (base and base + 8, grouped under s_clause) and two 64-bit
;    flat stores;
;  - the remaining four prefixes expect a single 128-bit flat load and a single
;    128-bit flat store (flat_load_dwordx4 / flat_load_b128 and the matching
;    stores).
define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
|
|
; SPLIT-LABEL: test_flat_v4_aligned8:
|
|
; SPLIT: ; %bb.0: ; %bb
|
|
; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0
|
|
; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0
|
|
; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8
|
|
; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
|
|
; SPLIT-NEXT: s_clause 0x1
|
|
; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7]
|
|
; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9]
|
|
; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
|
|
; SPLIT-NEXT: v_mov_b32_e32 v2, v0
|
|
; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SPLIT-NEXT: v_mov_b32_e32 v5, v3
|
|
; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2]
|
|
; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
|
|
; SPLIT-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8:
|
|
; ALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
|
|
; ALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
|
|
; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
|
|
; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
|
|
; ALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
|
|
; ALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX10-LABEL: test_flat_v4_aligned8:
|
|
; UNALIGNED-GFX10: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v8, s0, s1, 0, s0
|
|
; UNALIGNED-GFX10-NEXT: flat_load_dwordx4 v[0:3], v[7:8]
|
|
; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v5, v1
|
|
; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v6, v0
|
|
; UNALIGNED-GFX10-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
|
|
; UNALIGNED-GFX10-NEXT: s_endpgm
|
|
;
|
|
; ALIGNED-GFX11-LABEL: test_flat_v4_aligned8:
|
|
; ALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; ALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
|
|
; ALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
|
|
; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
|
|
; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
|
|
; ALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
|
|
; ALIGNED-GFX11-NEXT: s_endpgm
|
|
;
|
|
; UNALIGNED-GFX11-LABEL: test_flat_v4_aligned8:
|
|
; UNALIGNED-GFX11: ; %bb.0: ; %bb
|
|
; UNALIGNED-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
|
; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_u32 v7, s0, s0, v0
|
|
; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s1, 0, s0
|
|
; UNALIGNED-GFX11-NEXT: flat_load_b128 v[0:3], v[7:8]
|
|
; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1
|
|
; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v6, v0
|
|
; UNALIGNED-GFX11-NEXT: flat_store_b128 v[7:8], v[3:6]
|
|
; UNALIGNED-GFX11-NEXT: s_endpgm
|
|
bb:
|
|
; Per-workitem address: %arg advanced by workitem.id.x i32 elements.
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
%gep = getelementptr inbounds i32, ptr %arg, i32 %lid
|
|
%load = load <4 x i32>, ptr %gep, align 8
|
|
%v1 = extractelement <4 x i32> %load, i32 0
|
|
%v2 = extractelement <4 x i32> %load, i32 1
|
|
%v3 = extractelement <4 x i32> %load, i32 2
|
|
%v4 = extractelement <4 x i32> %load, i32 3
|
|
; Rebuild the vector in reversed element order (3, 2, 1, 0), then store it back.
%v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
|
|
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
|
|
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
|
|
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
|
|
store <4 x i32> %v8, ptr %gep, align 8
|
|
ret void
|
|
}
|
|
|
|
; Intrinsic declaration: takes no arguments and returns an i32. The kernels in
; this file call it to obtain the per-workitem element index applied to %arg.
declare i32 @llvm.amdgcn.workitem.id.x()
|