Ivan Kosarev 9e55d81c68
[AMDGPU][AsmParser] Introduce MC representation for lit() and lit64(). (#160316)
And rework the lit64() support to use it.

The rules for when to add lit64() can be simplified and
improved. In this change, however, we just follow the existing
conventions on the assembler and disassembler sides.

In codegen we do not (and normally should not need to) add explicit
lit() and lit64() modifiers, so the codegen tests lose them. The change
is an NFCI otherwise.

Simplifies printing operands.
2025-09-24 12:35:50 +01:00

1372 lines
59 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1250 -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-UNALIGNED %s
@lds = addrspace(3) global [512 x float] poison, align 4
@lds.f64 = addrspace(3) global [512 x double] poison, align 8
; Baseline case: one float is loaded from %in[workitem.id.x] and stored twice
; into @lds, at element index id and at id + 8. The checks for every run line
; expect the two stores to be emitted as a single two-address DS write
; (ds_write2_b32, or ds_store_2addr_b32 on gfx1250) with offset1:8 and the
; same data register used for both operands.
define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_one_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_one_val_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v1 offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%val = load float, ptr addrspace(1) %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Two distinct values: adjacent floats %in[id] and %in[id + 1] are read with
; volatile loads and stored to @lds at element index id and id + 8. The LDS
; stores themselves are not volatile, and the checks expect them to be emitted
; as one two-address DS write with offset1:8 that takes two different data
; registers.
define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
%val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Negative case: same store pattern as the two-value test (element index id
; and id + 8 of @lds), but the FIRST LDS store is volatile. The checks expect
; the stores to stay separate: two single-address DS writes, the second one
; with offset:32 (8 floats * 4 bytes), and no two-address write.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v0, v1
; GFX1250-NEXT: ds_store_b32 v0, v2 offset:32
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
%val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store volatile float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Negative case mirroring the volatile_0 variant: here the SECOND LDS store
; (element index id + 8) is the volatile one. The checks again expect two
; separate single-address DS writes, the second with offset:32, rather than a
; merged two-address write.
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_volatile_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v0, v1
; GFX1250-NEXT: ds_store_b32 v0, v2 offset:32
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
%val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store volatile float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; 2 data subregisters from different super registers.
; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
; This should be an s_mov_b32. The v_mov_b32 gets introduced by an
; early legalization of the constant bus constraint on the v_lshl_add_u32,
; and then SIFoldOperands folds in an unlucky order.
; Two volatile <2 x float> loads feed the stores: element 0 of the first
; vector goes to @lds[id] and element 1 of the second vector goes to
; @lds[id + 8]. The checks expect a single two-address DS write with
; offset1:8 whose two data operands are single registers taken out of the two
; separately loaded 64-bit register pairs.
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: ; kill: killed $vgpr4
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:8 scale_offset scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 2, v4
; GFX1250-NEXT: ds_store_2addr_b32 v1, v0, v3 offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr <2 x float>, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile <2 x float>, ptr addrspace(1) %in.gep.0, align 8
%val1 = load volatile <2 x float>, ptr addrspace(1) %in.gep.1, align 8
%val0.0 = extractelement <2 x float> %val0, i32 0
%val1.1 = extractelement <2 x float> %val1, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0.0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1.1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Both stored values come from ONE (non-volatile) <2 x float> load: element 0
; goes to @lds[id] and element 1 to @lds[id + 8]. The checks expect one
; two-address DS write with offset1:8 whose data operands are the two halves
; of the single loaded 64-bit register pair.
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg2_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_subreg2_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v2, v0, v1 offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
%val = load <2 x float>, ptr addrspace(1) %in.gep, align 8
%val0 = extractelement <2 x float> %val, i32 0
%val1 = extractelement <2 x float> %val, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Like the subreg2 test but with one <4 x float> load: elements 0 and 3 (the
; first and last lanes) are stored to @lds[id] and @lds[id + 8]. The checks
; expect one two-address DS write with offset1:8 whose data operands are the
; first and last registers of the loaded 128-bit register tuple.
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_subreg4_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_subreg4_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 2, v4
; GFX1250-NEXT: ds_store_2addr_b32 v1, v0, v3 offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %x.i
%val = load <4 x float>, ptr addrspace(1) %in.gep, align 16
%val0 = extractelement <4 x float> %val, i32 0
%val1 = extractelement <4 x float> %val, i32 3
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Boundary case: the second store targets @lds[id + 255]. The checks expect
; the pair to still be merged into one two-address DS write, now with
; offset1:255. Per the test name, 255 is the largest element distance that is
; expected to merge; the too_far test covers a distance (257) that is not.
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_max_offset_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_max_offset_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:255
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
%val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
%val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Negative case: the second store targets @lds[id + 257]. The checks expect
; no merge here: two separate single-address DS writes, the second carrying a
; plain byte offset of 1028 (257 floats * 4 bytes).
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_too_far_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:1028
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v2 offset:1028
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_too_far_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x1
; GFX1250-NEXT: ds_store_b32 v0, v1
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v0, v2 offset:1028
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 257
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val1, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Four stores from two loaded values, at @lds element index id + 0, id + 8,
; id + 11 and id + 27 (%val0, %val1, %val0, %val1 in that order). The checks
; expect them to be paired in program order into two two-address DS writes:
; one with offset1:8 and one with offset0:11 offset1:27.
define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f32_x2:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:8
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:11 offset1:27
; GFX1250-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%idx.1 = add nsw i32 %tid.x, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
store float %val1, ptr addrspace(3) %arrayidx1, align 4
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
store float %val0, ptr addrspace(3) %arrayidx2, align 4
%idx.3 = add nsw i32 %tid.x, 27
%arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
store float %val1, ptr addrspace(3) %arrayidx3, align 4
ret void
}
; Same as the x2 test except that the first store is at element index id + 3
; instead of id + 0, so no store in the function sits at a zero offset from
; the shared base address. The checks expect two two-address DS writes that
; both carry explicit offsets: offset0:3 offset1:8 and offset0:11 offset1:27.
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:8
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:11 offset1:27
; GFX1250-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
%idx.0 = add nsw i32 %tid.x, 3
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
store float %val0, ptr addrspace(3) %arrayidx0, align 4
%idx.1 = add nsw i32 %tid.x, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
store float %val1, ptr addrspace(3) %arrayidx1, align 4
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
store float %val0, ptr addrspace(3) %arrayidx2, align 4
%idx.3 = add nsw i32 %tid.x, 27
%arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
store float %val1, ptr addrspace(3) %arrayidx3, align 4
ret void
}
; The two LDS addresses come from the two lanes of a <2 x ptr addrspace(3)>
; kernel argument instead of from @lds, so the stores use two unrelated base
; pointers. The checks expect no merge: two single-address DS writes, each
; through its own address register and each with offset:32.
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 {
; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x6
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: v_mov_b32_e32 v3, s9
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v1, v2 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v3, v0 offset:32
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1 offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v3, v2 offset:32
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x18
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scale_offset
; GFX1250-NEXT: global_load_b32 v0, v0, s[2:3] scale_offset
; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX1250-NEXT: s_wait_loadcnt 0x1
; GFX1250-NEXT: ds_store_b32 v2, v1 offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b32 v3, v0 offset:32
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
%in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
%val0 = load float, ptr addrspace(1) %in0.gep, align 4
%val1 = load float, ptr addrspace(1) %in1.gep, align 4
; NOTE(review): both insertelements below write lane 0, so the constant 8
; replaces %x.i and lane 1 of the index vector stays poison. Lane index 1 may
; have been intended for the second insert -- confirm. Changing it alters the
; generated code, so the checks would have to be regenerated.
%index.0 = insertelement <2 x i32> poison, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
%gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
%gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
; Apply an additional offset after the vector that will be more obviously folded.
%gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
store float %val0, ptr addrspace(3) %gep.0, align 4
; NOTE(review): %add.x is not used by any later instruction in this function.
%add.x = add nsw i32 %x.i, 8
store float %val1, ptr addrspace(3) %gep.1.offset, align 4
ret void
}
; 64-bit counterpart of the one-value f32 test: one double is loaded from
; %in[workitem.id.x] and stored to @lds.f64 at element index id and id + 8,
; both with align 8. The checks expect a single 64-bit two-address DS write
; (ds_write2_b64, or ds_store_2addr_b64 on gfx1250) with offset1:8 and the
; same register pair used for both data operands.
define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_one_val_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v2, 0x1ff8, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1]
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b64 v2, v[0:1], v[0:1] offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
store double %val, ptr addrspace(3) %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
store double %val, ptr addrspace(3) %arrayidx1, align 8
ret void
}
; Stores the same double twice through the kernel argument %lds, at element %x.i and
; element %x.i + 7, with the f64 stores marked only align 4.
; The assertions below expect each 8-byte store to become one two-address 32-bit DS
; write (dword offsets 0/1 and 14/15) instead of the two stores being paired as b64.
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
; CI-LABEL: misaligned_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v2, s2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: misaligned_simple_write2_one_val_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v2, 0x1ff8, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v2, s2, v2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b32 v2, v0, v1 offset1:1
; GFX1250-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:14 offset1:15
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
; Note the align 4 on an 8-byte store; this is the "misaligned" part of the test.
store double %val, ptr addrspace(3) %arrayidx0, align 4
; 7 doubles = 56 bytes = 14 dwords, matching offset0:14 in the expected output.
%add.x = add nsw i32 %x.i, 7
%arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
store double %val, ptr addrspace(3) %arrayidx1, align 4
ret void
}
; Stores one double twice, at byte offsets 5 and 9 from %lds + 8 * %x.i, with align 1.
; Expected output differs per run line:
;  - bonaire and gfx900 with -unaligned-access-mode split each store into byte writes;
;  - gfx900 and gfx1250 with +unaligned-access-mode keep two 64-bit DS writes at byte
;    offsets 5 and 9 (the stores are not combined into a two-address write).
define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:5
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:9
; CI-NEXT: ds_write_b8 v0, v2 offset:13
; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: ds_write_b8 v0, v3 offset:8
; CI-NEXT: ds_write_b8 v0, v4 offset:7
; CI-NEXT: ds_write_b8 v0, v5 offset:6
; CI-NEXT: ds_write_b8 v0, v3 offset:12
; CI-NEXT: ds_write_b8 v0, v4 offset:11
; CI-NEXT: ds_write_b8 v0, v5 offset:10
; CI-NEXT: ds_write_b8 v0, v1 offset:16
; CI-NEXT: ds_write_b8 v0, v6 offset:15
; CI-NEXT: ds_write_b8 v0, v2 offset:14
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s2, v2
; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16
; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s2, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
; GFX9-UNALIGNED-NEXT: s_endpgm
;
; GFX1250-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v2, 0x1ff8, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_add_nc_u32_e32 v2, s2, v2
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:5
; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:9
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%val = load double, ptr addrspace(1) %in.gep, align 8
%base = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
; Byte-granular GEPs give destinations at odd byte offsets (5 and 9) from %base;
; the two 8-byte stores therefore overlap in bytes 9..12.
%addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
store double %val, ptr addrspace(3) %addr0.i8, align 1
%addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
store double %val, ptr addrspace(3) %addr1.i8, align 1
ret void
}
; Loads two adjacent doubles (%in[%x.i] and the element after it) with volatile loads
; and stores them to @lds.f64[%x.i] and @lds.f64[%x.i + 8], both align 8.
; The assertions below expect one two-address 64-bit DS write that takes the two
; different loaded values as its data operands, with the second offset at 8 elements.
define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_two_val_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: simple_write2_two_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_two_val_f64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v4, 0x1ff8, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:8 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:8
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
%in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
; The loads are volatile; the expected output keeps them as two separate loads.
%val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
%val1 = load volatile double, ptr addrspace(1) %in.gep.1, align 8
%arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
store double %val0, ptr addrspace(3) %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
store double %val1, ptr addrspace(3) %arrayidx1, align 8
ret void
}
; Four-element i32 array in address space 3, written at constant indices by the
; store_constant_adjacent_offsets and store_constant_disjoint_offsets tests.
@foo = addrspace(3) global [4 x i32] poison, align 4
; Stores the constant 123 to @foo[0] and @foo[1], i.e. two adjacent dwords.
; The assertions below expect the pair to be emitted as one 64-bit DS write rather
; than a two-address 32-bit write.
define amdgpu_kernel void @store_constant_adjacent_offsets() {
; CI-LABEL: store_constant_adjacent_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_adjacent_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: store_constant_adjacent_offsets:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x7b0000007b
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: ds_store_b64 v2, v[0:1]
; GFX1250-NEXT: s_endpgm
store i32 123, ptr addrspace(3) @foo, align 4
store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
ret void
}
; Stores the constant 123 to @foo[0] and @foo[2], leaving @foo[1] untouched.
; The assertions below expect a single two-address 32-bit DS write whose second
; offset is 2 dwords.
define amdgpu_kernel void @store_constant_disjoint_offsets() {
; CI-LABEL: store_constant_disjoint_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_constant_disjoint_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v1, v0, v0 offset1:2
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: store_constant_disjoint_offsets:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1250-NEXT: ds_store_2addr_b32 v1, v0, v0 offset1:2
; GFX1250-NEXT: s_endpgm
store i32 123, ptr addrspace(3) @foo, align 4
store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
ret void
}
; Four-element i64 array in address space 3 that is declared with only align 4;
; target of the store_misaligned64_constant_offsets test.
@bar = addrspace(3) global [4 x i64] poison, align 4
; Stores the i64 constant 123 to @bar[0] and @bar[1], with align 4 on both stores.
; The assertions below expect the two adjacent 8-byte stores to be emitted as one
; 128-bit DS write on every tested target.
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, v0
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: store_misaligned64_constant_offsets:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
; GFX1250-NEXT: ds_store_b128 v1, v[0:3]
; GFX1250-NEXT: s_endpgm
store i64 123, ptr addrspace(3) @bar, align 4
store i64 123, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
ret void
}
; 4096-element i64 array in address space 3 (align 4), large enough that the
; store_misaligned64_constant_large_offsets test can address elements 2048 and 4095.
@bar.large = addrspace(3) global [4096 x i64] poison, align 4
; Stores the i64 constant 123 to @bar.large[2048] and @bar.large[4095] (byte offsets
; 16384 and 32760) with align 4.
; The assertions below expect two separate 64-bit DS writes that carry those byte
; offsets as immediates; the stores are not combined.
; NOTE(review): presumably the offsets cannot be encoded in a two-address form;
; confirm against the load/store optimizer if this matters.
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: store_misaligned64_constant_large_offsets:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x7b
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:16384
; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:32760
; GFX1250-NEXT: s_endpgm
store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
ret void
}
; Two float arrays in address space 3 used by the write2_sgemm_sequence test;
; lA is indexed from the workgroup id and lB from the workitem y id in that test.
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] poison, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] poison, align 4
; Stores one float loaded from %in to ten locations:
;  - @sgemm.lA at indices x, x+1, x+16, x+17, where x is the workgroup id in x;
;  - @sgemm.lB at indices y, y+1, y+32, y+33, y+64, y+65, where y is the workitem id in y.
; The assertions below expect five two-address 32-bit DS writes, each covering one
; pair of adjacent elements (N, N+1). The %lda and %ldb arguments are not used.
define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 {
; CI-LABEL: write2_sgemm_sequence:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_lshl_b32 s1, s8, 2
; CI-NEXT: s_add_i32 s2, s1, 0xc20
; CI-NEXT: s_addk_i32 s1, 0xc60
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, s1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write2_sgemm_sequence:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX9-NEXT: s_lshl_b32 s2, s8, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
; GFX9-NEXT: s_addk_i32 s2, 0xc60
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
; GFX9-NEXT: s_endpgm
;
; GFX1250-LABEL: write2_sgemm_sequence:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x10
; GFX1250-NEXT: s_and_b32 s2, ttmp6, 15
; GFX1250-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_bfe_u32 s1, ttmp6, 0x4000c
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_co_i32 s1, s1, 1
; GFX1250-NEXT: s_mul_i32 s1, ttmp9, s1
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_co_i32 s2, s2, s1
; GFX1250-NEXT: s_cmp_eq_u32 s3, 0
; GFX1250-NEXT: s_cselect_b32 s1, ttmp9, s2
; GFX1250-NEXT: s_lshl_b32 s1, s1, 2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_co_i32 s2, s1, 0xc20
; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshrrev_b32 v0, 8, v0
; GFX1250-NEXT: s_addk_co_i32 s1, 0xc60
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v2, s0
; GFX1250-NEXT: v_mov_b32_e32 v3, s0
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX1250-NEXT: ds_store_2addr_b32 v1, v2, v3 offset1:1
; GFX1250-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX1250-NEXT: ds_store_2addr_b32 v0, v2, v3 offset1:1
; GFX1250-NEXT: ds_store_2addr_b32 v0, v2, v3 offset0:32 offset1:33
; GFX1250-NEXT: ds_store_2addr_b32 v0, v2, v3 offset0:64 offset1:65
; GFX1250-NEXT: s_endpgm
; Despite the similar names, %x.i is a workgroup id while %y.i is a workitem id.
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%val = load float, ptr addrspace(1) %in
; Stores into @sgemm.lA at x, x+1, x+16, x+17.
%arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
store float %val, ptr addrspace(3) %arrayidx44, align 4
%add47 = add nsw i32 %x.i, 1
%arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
store float %val, ptr addrspace(3) %arrayidx48, align 4
%add51 = add nsw i32 %x.i, 16
%arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
store float %val, ptr addrspace(3) %arrayidx52, align 4
%add55 = add nsw i32 %x.i, 17
%arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
store float %val, ptr addrspace(3) %arrayidx56, align 4
; Stores into @sgemm.lB at y, y+1, y+32, y+33, y+64, y+65.
%arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
store float %val, ptr addrspace(3) %arrayidx60, align 4
%add63 = add nsw i32 %y.i, 1
%arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
store float %val, ptr addrspace(3) %arrayidx64, align 4
%add67 = add nsw i32 %y.i, 32
%arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
store float %val, ptr addrspace(3) %arrayidx68, align 4
%add71 = add nsw i32 %y.i, 33
%arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
store float %val, ptr addrspace(3) %arrayidx72, align 4
%add75 = add nsw i32 %y.i, 64
%arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
store float %val, ptr addrspace(3) %arrayidx76, align 4
%add79 = add nsw i32 %y.i, 65
%arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
store float %val, ptr addrspace(3) %arrayidx80, align 4
ret void
}
; Loads a <4 x float> from %in and stores it to %out[%x.i], both with align 4.
; The assertions below expect the 16-byte store to be emitted as two two-address
; 32-bit DS writes covering dword offsets 0/1 and 2/3. The order of the two writes
; differs between run lines (with +unaligned-access-mode the 2/3 pair comes first).
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: simple_write2_v4f32_superreg_align4:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s2
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
;
; GFX1250-LABEL: simple_write2_v4f32_superreg_align4:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
; GFX1250-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1
; GFX1250-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:2 offset1:3
; GFX1250-NEXT: ds_store_2addr_b32 v0, v3, v4 offset1:1
; GFX1250-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; This GEP has no indices, so the load reads from %in itself (same address for every lane).
%in.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %in
%val0 = load <4 x float>, ptr addrspace(1) %in.gep, align 4
%out.gep = getelementptr inbounds <4 x float>, ptr addrspace(3) %out, i32 %x.i
store <4 x float> %val0, ptr addrspace(3) %out.gep, align 4
ret void
}
; Array of <2 x i32> in address space 3 declared with align 1; target of the
; write2_v2i32_align1_odd_offset test.
@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] poison, align 1
; Stores the constant vector <123, 456> at byte offset 65 of @v2i32_align1, align 1.
; Expected output differs per run line:
;  - bonaire and gfx900 with -unaligned-access-mode emit eight single-byte DS writes
;    at offsets 65..72 (0x7b, then 0xc8 and 0x01 for 456, zeros elsewhere);
;  - gfx900 and gfx1250 with +unaligned-access-mode emit one 64-bit DS write at offset 65.
define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-LABEL: write2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: ds_write_b8 v1, v0 offset:70
; CI-NEXT: v_mov_b32_e32 v0, 0xc8
; CI-NEXT: ds_write_b8 v1, v0 offset:69
; CI-NEXT: ds_write_b8 v1, v1 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:67
; CI-NEXT: ds_write_b8 v1, v1 offset:66
; CI-NEXT: ds_write_b8 v1, v1 offset:72
; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
; GFX9-UNALIGNED-NEXT: s_endpgm
;
; GFX1250-LABEL: write2_v2i32_align1_odd_offset:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x1c80000007b
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:65
; GFX1250-NEXT: s_endpgm
entry:
; The i8 GEP places the destination at an odd byte offset (65) from the global.
store <2 x i32> <i32 123, i32 456>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
ret void
}
; Intrinsics that return the workgroup and workitem ids. The workgroup.id.y variant
; is declared but not called by any test visible in this part of the file.
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1
; Attribute groups. #0 is attached to most of the kernels above, #1 to the intrinsic
; declarations and their call sites.
; NOTE(review): #2 is not referenced anywhere in this part of the file; confirm
; whether an earlier test still uses it.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { convergent nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX1250-UNALIGNED: {{.*}}