Min-Yih Hsu 7ebbbd885f
[DAG] Always use stack to promote bitcast when the source is vector (#151065)
The optimization introduced by #125637 tried to avoid using stacks to
promote bitcast with vector result type. However, it wouldn't be correct
if the input type is a vector. This patch limits that optimization to
only scalar-to-vector bitcasts.
2025-08-02 15:32:10 -07:00

6190 lines
226 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn | FileCheck -check-prefixes=SI %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=GFX9 %s
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck -check-prefixes=EGCM,EG %s
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck -check-prefixes=EGCM,CM %s
; Kernel taking a plain i8 argument (no zeroext/signext attribute). The body
; zero-extends %in to i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): the argument is
; fetched with a dword scalar load and masked with 0xff via s_and_b32 before
; the dword store.
; Expected output on the r600 runs (EG, CM prefixes): the argument is fetched
; with VTX_READ_8 and stored without any further extension instruction.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
; SI-LABEL: i8_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i8_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i8_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i8_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i8 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking an i8 argument marked zeroext. The body zero-extends %in to
; i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): dword scalar
; load of the argument, then s_and_b32 with 0xff before the dword store.
; Expected output on the r600 runs (EG, CM prefixes): VTX_READ_8 fetch
; followed by BFE_INT with a width literal of 8.
; NOTE(review): the r600 lines expect BFE_INT here, the same instruction the
; signext variant @i8_sext_arg expects. These lines are autogenerated, so they
; record current llc output rather than a hand-verified expectation; confirm
; this is intended for a zeroext argument.
define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
; SI-LABEL: i8_zext_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i8_zext_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i8_zext_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
;
; CM-LABEL: i8_zext_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i8 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking an i8 argument marked signext. The body sign-extends %in to
; i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): dword scalar
; load of the argument, then s_sext_i32_i8 before the dword store.
; Expected output on the r600 runs (EG, CM prefixes): VTX_READ_8 fetch
; followed by BFE_INT with a width literal of 8.
define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
; SI-LABEL: i8_sext_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sext_i32_i8 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_sext_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i8_sext_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i8_sext_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
;
; CM-LABEL: i8_sext_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = sext i8 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking a plain i16 argument (no zeroext/signext attribute). The body
; zero-extends %in to i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): dword scalar
; load of the argument, then s_and_b32 with 0xffff before the dword store.
; Expected output on the r600 runs (EG, CM prefixes): the argument is fetched
; with VTX_READ_16 and stored without any further extension instruction.
define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
; SI-LABEL: i16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i16_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i16_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i16_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i16 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking an i16 argument marked zeroext. The body zero-extends %in to
; i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): dword scalar
; load of the argument, then s_and_b32 with 0xffff before the dword store.
; Expected output on the r600 runs (EG, CM prefixes): VTX_READ_16 fetch
; followed by BFE_INT with a width literal of 16.
; NOTE(review): the r600 lines expect BFE_INT here, the same instruction the
; signext variant @i16_sext_arg expects. These lines are autogenerated, so
; they record current llc output rather than a hand-verified expectation;
; confirm this is intended for a zeroext argument.
define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
; SI-LABEL: i16_zext_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i16_zext_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i16_zext_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i16_zext_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CM-LABEL: i16_zext_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i16 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking an i16 argument marked signext. The body sign-extends %in to
; i32 and stores it to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): dword scalar
; load of the argument, then s_sext_i32_i16 before the dword store.
; Expected output on the r600 runs (EG, CM prefixes): VTX_READ_16 fetch
; followed by BFE_INT with a width literal of 16.
define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
; SI-LABEL: i16_sext_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sext_i32_i16 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i16_sext_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i16_sext_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i16_sext_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CM-LABEL: i16_sext_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = sext i16 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking an i32 argument and storing it unchanged to %out.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): one dword scalar
; load for the argument, a copy into a VGPR, and a dword store; no masking or
; extension instruction.
; Expected output on the r600 runs (EG, CM prefixes): the value is moved
; straight from KC0[2].Z with no fetch clause.
; The "%entry" annotation in the expected block header comes from the IR
; label below, so renaming that label requires regenerating the assertions.
define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind {
; SI-LABEL: i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T1.X, KC0[2].Z,
entry:
store i32 %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking a float argument and storing it unchanged to %out.
; The expected instruction sequences for every run are the same as those
; listed for @i32_arg: a single dword load/move of the argument and a dword
; store, with no conversion instruction.
define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind {
; SI-LABEL: f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T1.X, KC0[2].Z,
entry:
store float %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking a <2 x i8> argument and storing it unchanged to %out. Unlike
; the scalar kernels above, this one carries no nocapture/nounwind attributes
; and the store has no explicit alignment.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): the argument is
; read with a dword scalar load and written with a single 16-bit store
; (buffer_store_short / flat_store_short / global_store_short).
; Expected output on the r600 runs (EG, CM prefixes): the argument is fetched
; with VTX_READ_16 and written through MEM_RAT MSKOR, with the shift amount
; and 0xffff (65535) mask derived from the low two bits of the %out address.
define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
; SI-LABEL: v2i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v2i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v2i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, 0.0,
; CM-NEXT: MOV * T0.Z, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
; Kernel taking a <2 x i16> argument and storing it unchanged to %out. The
; kernel carries no nocapture/nounwind attributes and the store has no
; explicit alignment.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): one dword scalar
; load for the whole vector and one dword store.
; Expected output on the r600 runs (EG, CM prefixes): the vector is moved from
; KC0[2].Z and written with a single dword store, with no fetch clause.
define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
; SI-LABEL: v2i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v2i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v2i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.X, KC0[2].Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x i16> %in, ptr addrspace(1) %out
ret void
}
; Kernel taking a <2 x i32> argument and storing it to %out with align 4.
; Expected output on the amdgcn runs (SI, VI, GFX9 prefixes): a single
; s_load_dwordx4 brings in both the %out pointer and the vector, and the
; vector is written with one dwordx2 store.
; Expected output on the r600 runs (EG, CM prefixes): the two elements are
; moved from KC0[2].W and KC0[3].X and written with one store, with no fetch
; clause.
define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind {
; SI-LABEL: v2i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v2i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v2i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].X,
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.Y, KC0[3].X,
; CM-NEXT: MOV * T0.X, KC0[2].W,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking a <2 x float> argument and storing it to %out with align 4.
; The expected instruction sequences for every run are the same as those
; listed for @v2i32_arg: on amdgcn a single s_load_dwordx4 covers the pointer
; and the vector, followed by one dwordx2 store; on r600 the elements are
; moved from KC0[2].W and KC0[3].X.
define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind {
; SI-LABEL: v2f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v2f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v2f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].X,
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v2f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.Y, KC0[3].X,
; CM-NEXT: MOV * T0.X, KC0[2].W,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <2 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking a <3 x i8> argument and storing it to %out with align 4.
; Expected output on the amdgcn runs: the argument is read with one dword
; scalar load and written as a 16-bit store at offset 0 plus a byte store at
; offset 2. The SI and VI runs shift the loaded dword right by 16 to obtain
; the third byte; the GFX9 run instead uses global_store_byte_d16_hi on the
; unshifted register.
; Expected output on the r600 runs (EG, CM prefixes): three VTX_READ_8 fetches
; (offsets 40, 41, 42) and two MEM_RAT MSKOR writes, one for the low two bytes
; (mask 65535) and one for the third byte (mask 255) at %out + 2.
define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
; SI-LABEL: v3i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v3i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s0, 2
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v3i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @12, KC0[], KC1[]
; EG-NEXT: TEX 2 @6
; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: LSHL T0.W, T5.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PS, literal.x,
; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T4.X, PV.W, PS,
; EG-NEXT: LSHL * T4.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T6.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T4.Z, 0.0,
; EG-NEXT: MOV * T5.Z, 0.0,
; EG-NEXT: LSHR T6.X, T0.W, literal.x,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[], KC1[]
; CM-NEXT: TEX 2 @6
; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T4.X, 0.0,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T4.Y, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T6.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV * T4.Z, 0.0,
; CM-NEXT: MOV * T5.Z, 0.0,
; CM-NEXT: LSHR * T6.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <3 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel that stores its <3 x i16> by-value argument to %out with 4-byte
; alignment. The recorded amdgcn output writes the vector as one 32-bit store
; at offset 0 plus one 16-bit store at offset 4. The recorded r600 output
; fetches the three elements with VTX_READ_16, packs the first two into a
; single dword store, and writes the third through a MSKOR store at byte
; offset 4.
define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
; SI-LABEL: v3i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v3i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v3i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @12, KC0[], KC1[]
; EG-NEXT: TEX 2 @6
; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV * T5.Z, 0.0,
; EG-NEXT: LSHR T8.X, T0.W, literal.x,
; EG-NEXT: LSHL T0.W, T7.X, literal.y,
; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT T6.X, PV.W, PS,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[], KC1[]
; CM-NEXT: TEX 2 @6
; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T5.X, 0.0,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV * T5.Z, 0.0,
; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <3 x i16> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <3 x i32> by-value argument to %out with 4-byte
; alignment. In the recorded amdgcn output the default-CPU run splits the
; write into a dword store at offset 8 plus a dwordx2 store, while the tonga
; and gfx900 runs use a single dwordx3 store. The recorded r600 output issues
; two cacheless stores: the first two elements at the base address and the
; third element at byte offset 8.
define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
; SI-LABEL: v3i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v3i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v3i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].Z,
; EG-NEXT: MOV T0.X, KC0[3].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.X, KC0[3].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T1.X, KC0[3].W,
; CM-NEXT: MOV * T2.Y, KC0[3].Z,
; CM-NEXT: MOV * T2.X, KC0[3].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <3 x i32> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <3 x float> by-value argument to %out with 4-byte
; alignment. In the recorded amdgcn output the default-CPU run splits the
; write into a dword store at offset 8 plus a dwordx2 store, while the tonga
; and gfx900 runs use a single dwordx3 store. The recorded r600 output issues
; two cacheless stores: the first two elements at the base address and the
; third element at byte offset 8.
define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
; SI-LABEL: v3f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v3f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v3f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].Z,
; EG-NEXT: MOV T0.X, KC0[3].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.X, KC0[3].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T1.X, KC0[3].W,
; CM-NEXT: MOV * T2.Y, KC0[3].Z,
; CM-NEXT: MOV * T2.X, KC0[3].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <3 x float> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <4 x i8> by-value argument to %out. The store carries
; no explicit alignment and the function has no nocapture/nounwind attributes.
; Every recorded configuration treats the vector as one 32-bit value: the
; amdgcn output loads a single dword and issues one dword store, and the r600
; output moves one constant-buffer element into a single cacheless store.
define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
; SI-LABEL: v4i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v4i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.X, KC0[2].Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <4 x i8> %in, ptr addrspace(1) %out
  ret void
}
; Kernel that stores its <4 x i16> by-value argument to %out. The store carries
; no explicit alignment and the function has no nocapture/nounwind attributes.
; Every recorded configuration treats the vector as one 64-bit value: the
; amdgcn output issues a single dwordx2 store, and the r600 output fills the X
; and Y channels of one register and issues a single cacheless store.
define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; SI-LABEL: v4i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v4i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].X,
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.Y, KC0[3].X,
; CM-NEXT: MOV * T0.X, KC0[2].W,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <4 x i16> %in, ptr addrspace(1) %out
  ret void
}
; Kernel that stores its <4 x i32> by-value argument to %out with 4-byte
; alignment. Every recorded configuration writes the vector with a single
; store: a dwordx4 store in the amdgcn output, and one cacheless store of all
; four channels of a register in the r600 output.
define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind {
; SI-LABEL: v4i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v4i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[4].X,
; EG-NEXT: MOV * T0.Z, KC0[3].W,
; EG-NEXT: MOV * T0.Y, KC0[3].Z,
; EG-NEXT: MOV T0.X, KC0[3].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.W, KC0[4].X,
; CM-NEXT: MOV * T0.Z, KC0[3].W,
; CM-NEXT: MOV * T0.Y, KC0[3].Z,
; CM-NEXT: MOV * T0.X, KC0[3].Y,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <4 x i32> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <4 x float> by-value argument to %out with 4-byte
; alignment. Every recorded configuration writes the vector with a single
; store: a dwordx4 store in the amdgcn output, and one cacheless store of all
; four channels of a register in the r600 output.
define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind {
; SI-LABEL: v4f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v4f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[4].X,
; EG-NEXT: MOV * T0.Z, KC0[3].W,
; EG-NEXT: MOV * T0.Y, KC0[3].Z,
; EG-NEXT: MOV T0.X, KC0[3].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v4f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.W, KC0[4].X,
; CM-NEXT: MOV * T0.Z, KC0[3].W,
; CM-NEXT: MOV * T0.Y, KC0[3].Z,
; CM-NEXT: MOV * T0.X, KC0[3].Y,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <4 x float> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <5 x i8> by-value argument to %out with 4-byte
; alignment. The recorded amdgcn output writes the vector as one dword store
; at offset 0 plus one byte store at offset 4. The recorded r600 output
; fetches the five elements with VTX_READ_8, reassembles the first four bytes
; with shifts and ORs into a single dword store, and writes the fifth byte
; through a MSKOR store at byte offset 4.
define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
; SI-LABEL: v5i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: flat_store_byte v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @16, KC0[], KC1[]
; EG-NEXT: TEX 4 @6
; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T5.Z, 0.0,
; EG-NEXT: AND_INT T1.W, T9.X, literal.x,
; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PV.W, literal.x,
; EG-NEXT: LSHL * T2.W, T7.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: OR_INT T1.W, PS, PV.W,
; EG-NEXT: LSHL * T2.W, T0.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T1.W, PV.W, PS,
; EG-NEXT: AND_INT * T2.W, T6.X, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: OR_INT T6.X, PV.W, PS,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @16, KC0[], KC1[]
; CM-NEXT: TEX 4 @6
; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
; CM-NEXT: ALU clause starting at 16:
; CM-NEXT: MOV * T5.X, 0.0,
; CM-NEXT: ALU clause starting at 17:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV T5.Z, 0.0,
; CM-NEXT: AND_INT * T1.W, T9.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Y, T8.X, literal.x,
; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T7.X, T0.W, literal.x,
; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43)
; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <5 x i8> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel that stores its <5 x i16> by-value argument to %out with 4-byte
; alignment. The recorded amdgcn output writes the vector as one dwordx2 store
; at offset 0 plus one 16-bit store at offset 8. The recorded r600 output
; fetches the five elements with VTX_READ_16 and writes each one separately
; through its own MSKOR store (five in total), at byte offsets 0, 2, 4, 6
; and 8 from %out.
define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
; SI-LABEL: v5i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s6, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s6, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 8
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @20, KC0[], KC1[]
; EG-NEXT: TEX 4 @10
; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X
; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X
; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X
; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 21:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T2.W, T4.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T4.X, T2.W, PV.W,
; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T3.W, T3.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T2.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T3.X, T3.W, PV.W,
; EG-NEXT: LSHL * T3.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T3.Y, 0.0,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T6.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T7.W, T2.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T6.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T6.X, T7.W, PV.W,
; EG-NEXT: LSHL * T6.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T8.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T9.W, T1.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T8.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T8.X, T9.W, PV.W,
; EG-NEXT: LSHL * T8.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T8.Y, 0.0,
; EG-NEXT: MOV T5.Z, 0.0,
; EG-NEXT: MOV * T4.Z, 0.0,
; EG-NEXT: MOV T3.Z, 0.0,
; EG-NEXT: MOV * T6.Z, 0.0,
; EG-NEXT: MOV * T8.Z, 0.0,
; EG-NEXT: LSHR T0.X, T7.W, literal.x,
; EG-NEXT: LSHR * T1.X, T2.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, T1.W, literal.x,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHR * T9.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @20, KC0[], KC1[]
; CM-NEXT: TEX 4 @10
; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X
; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X
; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X
; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 10:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
; CM-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
; CM-NEXT: ALU clause starting at 20:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 21:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T4.Y, 0.0,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T2.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T3.X, literal.x,
; CM-NEXT: LSHL * T2.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T3.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T3.Y, 0.0,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T6.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T2.X, literal.x,
; CM-NEXT: LSHL * T6.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T6.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T6.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T8.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
; CM-NEXT: LSHL * T8.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T8.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T8.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T8.Y, 0.0,
; CM-NEXT: MOV * T5.Z, 0.0,
; CM-NEXT: MOV * T4.Z, 0.0,
; CM-NEXT: MOV * T3.Z, 0.0,
; CM-NEXT: MOV * T6.Z, 0.0,
; CM-NEXT: MOV * T8.Z, 0.0,
; CM-NEXT: LSHR * T0.X, T7.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, T2.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, T1.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T9.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  store <5 x i16> %in, ptr addrspace(1) %out, align 4
  ret void
}
; Kernel argument test for <5 x i32>: the vector argument is stored unchanged
; to %out with 4-byte alignment. The checks for every target show the store
; split in two, a four-dword store of the first four elements at %out plus a
; one-dword store of the fifth element 16 bytes past %out.
define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind {
; SI-LABEL: v5i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s8, s[4:5], 0x15
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dword s8, s[4:5], 0x54
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s6, 16
; VI-NEXT: s_addc_u32 s5, s7, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[5].X,
; EG-NEXT: MOV * T0.Z, KC0[4].W,
; EG-NEXT: MOV * T0.Y, KC0[4].Z,
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.X, KC0[5].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T0.W, KC0[5].X,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
; CM-NEXT: MOV * T0.Z, KC0[4].W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T2.X, KC0[5].Y,
; CM-NEXT: MOV * T0.Y, KC0[4].Z,
; CM-NEXT: MOV * T0.X, KC0[4].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <5 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel argument test for <5 x float>: the vector argument is stored
; unchanged to %out with 4-byte alignment. As the checks show, each target
; emits a four-dword store of the first four elements at %out and a separate
; one-dword store of the fifth element 16 bytes past %out.
define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind {
; SI-LABEL: v5f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s8, s[4:5], 0x15
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dword s8, s[4:5], 0x54
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s6, 16
; VI-NEXT: s_addc_u32 s5, s7, 0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v3, s8
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_dword v[1:2], v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[5].X,
; EG-NEXT: MOV * T0.Z, KC0[4].W,
; EG-NEXT: MOV * T0.Y, KC0[4].Z,
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.X, KC0[5].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T0.W, KC0[5].X,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
; CM-NEXT: MOV * T0.Z, KC0[4].W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T2.X, KC0[5].Y,
; CM-NEXT: MOV * T0.Y, KC0[4].Z,
; CM-NEXT: MOV * T0.X, KC0[4].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <5 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel argument test for <5 x i64>: the vector argument is stored unchanged
; to %out with 8-byte alignment. The checks for every target show three
; stores, four dwords at %out, four dwords at %out + 16 and the final two
; dwords (the fifth element) at %out + 32.
define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind {
; SI-LABEL: v5i64_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s12, s8, 32
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5i64_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5i64_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, KC0[7].X,
; EG-NEXT: MOV * T0.Z, KC0[6].W,
; EG-NEXT: MOV T0.Y, KC0[6].Z,
; EG-NEXT: MOV * T1.W, KC0[8].X,
; EG-NEXT: MOV T0.X, KC0[6].Y,
; EG-NEXT: MOV * T1.Z, KC0[7].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[7].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T1.X, KC0[7].Y,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: MOV T5.Y, KC0[8].Z,
; EG-NEXT: MOV * T5.X, KC0[8].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5i64_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MOV * T0.W, KC0[8].X,
; CM-NEXT: MOV T1.Y, KC0[8].Z,
; CM-NEXT: MOV * T0.Z, KC0[7].W,
; CM-NEXT: MOV T1.X, KC0[8].Y,
; CM-NEXT: MOV * T0.Y, KC0[7].Z,
; CM-NEXT: MOV T0.X, KC0[7].Y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T2.W, KC0[7].X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
; CM-NEXT: MOV T2.Z, KC0[6].W,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T4.X, PV.W, literal.x,
; CM-NEXT: MOV * T2.Y, KC0[6].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T2.X, KC0[6].Y,
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <5 x i64> %in, ptr addrspace(1) %out, align 8
ret void
}
; Kernel argument test for <5 x double>: the vector argument is stored
; unchanged to %out with 8-byte alignment. As the checks show, each target
; emits three stores, four dwords at %out, four dwords at %out + 16 and the
; final two dwords (the fifth element) at %out + 32.
define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
; SI-LABEL: v5f64_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
; SI-NEXT: s_endpgm
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s12, s8, 32
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v5f64_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v5f64_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, KC0[7].X,
; EG-NEXT: MOV * T0.Z, KC0[6].W,
; EG-NEXT: MOV T0.Y, KC0[6].Z,
; EG-NEXT: MOV * T1.W, KC0[8].X,
; EG-NEXT: MOV T0.X, KC0[6].Y,
; EG-NEXT: MOV * T1.Z, KC0[7].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[7].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T1.X, KC0[7].Y,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: MOV T5.Y, KC0[8].Z,
; EG-NEXT: MOV * T5.X, KC0[8].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5f64_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MOV * T0.W, KC0[8].X,
; CM-NEXT: MOV T1.Y, KC0[8].Z,
; CM-NEXT: MOV * T0.Z, KC0[7].W,
; CM-NEXT: MOV T1.X, KC0[8].Y,
; CM-NEXT: MOV * T0.Y, KC0[7].Z,
; CM-NEXT: MOV T0.X, KC0[7].Y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T2.W, KC0[7].X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
; CM-NEXT: MOV T2.Z, KC0[6].W,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T4.X, PV.W, literal.x,
; CM-NEXT: MOV * T2.Y, KC0[6].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T2.X, KC0[6].Y,
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
; FIXME: Lots of unpack and re-pack junk on EG and CM (the VI checks below no longer show any)
; Kernel argument test for <8 x i8>: the vector argument is stored unchanged
; to %out with no explicit alignment on the store. In the SI, VI and GFX9
; checks a single s_load_dwordx4 fetches both %out and the vector, followed by
; one two-dword store. In the EG and CM checks every byte is fetched by its
; own VTX_READ_8 and merged into the two result dwords with
; AND_INT / LSHL / OR_INT sequences before one store.
define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; SI-LABEL: v8i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v8i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v8i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v8i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @36, KC0[], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: ALU 5, @38, KC0[], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: ALU 5, @44, KC0[], KC1[]
; EG-NEXT: TEX 0 @24
; EG-NEXT: ALU 7, @50, KC0[], KC1[]
; EG-NEXT: TEX 0 @26
; EG-NEXT: ALU 7, @58, KC0[], KC1[]
; EG-NEXT: TEX 0 @28
; EG-NEXT: ALU 7, @66, KC0[], KC1[]
; EG-NEXT: TEX 0 @30
; EG-NEXT: ALU 7, @74, KC0[], KC1[]
; EG-NEXT: TEX 0 @32
; EG-NEXT: ALU 5, @82, KC0[], KC1[]
; EG-NEXT: TEX 0 @34
; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
; EG-NEXT: Fetch clause starting at 26:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
; EG-NEXT: Fetch clause starting at 28:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
; EG-NEXT: Fetch clause starting at 30:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
; EG-NEXT: Fetch clause starting at 32:
; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
; EG-NEXT: Fetch clause starting at 34:
; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
; EG-NEXT: ALU clause starting at 36:
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T5.X, 0.0,
; EG-NEXT: ALU clause starting at 38:
; EG-NEXT: LSHL T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 44:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 50:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 58:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 66:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 74:
; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 82:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T5.Y, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 88:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT T5.X, PV.W, PS,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @36, KC0[], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: ALU 5, @38, KC0[], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: ALU 5, @44, KC0[], KC1[]
; CM-NEXT: TEX 0 @24
; CM-NEXT: ALU 7, @50, KC0[], KC1[]
; CM-NEXT: TEX 0 @26
; CM-NEXT: ALU 7, @58, KC0[], KC1[]
; CM-NEXT: TEX 0 @28
; CM-NEXT: ALU 7, @66, KC0[], KC1[]
; CM-NEXT: TEX 0 @30
; CM-NEXT: ALU 7, @74, KC0[], KC1[]
; CM-NEXT: TEX 0 @32
; CM-NEXT: ALU 5, @82, KC0[], KC1[]
; CM-NEXT: TEX 0 @34
; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
; CM-NEXT: Fetch clause starting at 24:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
; CM-NEXT: Fetch clause starting at 26:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
; CM-NEXT: Fetch clause starting at 28:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
; CM-NEXT: Fetch clause starting at 30:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
; CM-NEXT: Fetch clause starting at 32:
; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
; CM-NEXT: Fetch clause starting at 34:
; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
; CM-NEXT: ALU clause starting at 36:
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T5.X, 0.0,
; CM-NEXT: ALU clause starting at 38:
; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 44:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 50:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 58:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 66:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 74:
; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 82:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Y,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 88:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
; Kernel argument test for <8 x i16>: the vector argument is stored unchanged
; to %out with no explicit alignment on the store. In the SI, VI and GFX9
; checks the vector is fetched with one s_load_dwordx4 (plus a separate
; s_load_dwordx2 for %out) and written with one four-dword store. In the EG
; and CM checks every element is fetched by its own VTX_READ_16 and merged
; into the four result dwords with AND_INT / LSHL / OR_INT sequences before
; one store.
define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; SI-LABEL: v8i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v8i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v8i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v8i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @36, KC0[], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: ALU 5, @38, KC0[], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: ALU 5, @44, KC0[], KC1[]
; EG-NEXT: TEX 0 @24
; EG-NEXT: ALU 5, @50, KC0[], KC1[]
; EG-NEXT: TEX 0 @26
; EG-NEXT: ALU 5, @56, KC0[], KC1[]
; EG-NEXT: TEX 0 @28
; EG-NEXT: ALU 5, @62, KC0[], KC1[]
; EG-NEXT: TEX 0 @30
; EG-NEXT: ALU 5, @68, KC0[], KC1[]
; EG-NEXT: TEX 0 @32
; EG-NEXT: ALU 5, @74, KC0[], KC1[]
; EG-NEXT: TEX 0 @34
; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
; EG-NEXT: Fetch clause starting at 26:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
; EG-NEXT: Fetch clause starting at 28:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
; EG-NEXT: Fetch clause starting at 30:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
; EG-NEXT: Fetch clause starting at 32:
; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
; EG-NEXT: Fetch clause starting at 34:
; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
; EG-NEXT: ALU clause starting at 36:
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T7.X, 0.0,
; EG-NEXT: ALU clause starting at 38:
; EG-NEXT: LSHL T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 44:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 50:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 56:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 62:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 68:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 74:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 80:
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
; EG-NEXT: 2(2.802597e-45), -65536(nan)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T7.X, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.X,
; EG-NEXT: MOV * T7.W, T3.X,
; EG-NEXT: MOV * T7.Y, T5.X,
;
; CM-LABEL: v8i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @36, KC0[], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: ALU 5, @38, KC0[], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: ALU 5, @44, KC0[], KC1[]
; CM-NEXT: TEX 0 @24
; CM-NEXT: ALU 5, @50, KC0[], KC1[]
; CM-NEXT: TEX 0 @26
; CM-NEXT: ALU 5, @56, KC0[], KC1[]
; CM-NEXT: TEX 0 @28
; CM-NEXT: ALU 5, @62, KC0[], KC1[]
; CM-NEXT: TEX 0 @30
; CM-NEXT: ALU 5, @68, KC0[], KC1[]
; CM-NEXT: TEX 0 @32
; CM-NEXT: ALU 5, @74, KC0[], KC1[]
; CM-NEXT: TEX 0 @34
; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
; CM-NEXT: Fetch clause starting at 24:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
; CM-NEXT: Fetch clause starting at 26:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
; CM-NEXT: Fetch clause starting at 28:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
; CM-NEXT: Fetch clause starting at 30:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
; CM-NEXT: Fetch clause starting at 32:
; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
; CM-NEXT: Fetch clause starting at 34:
; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
; CM-NEXT: ALU clause starting at 36:
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: MOV * T7.X, 0.0,
; CM-NEXT: ALU clause starting at 38:
; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 44:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 50:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 56:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 62:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 68:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 74:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 80:
; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
; CM-NEXT: 2(2.802597e-45), -65536(nan)
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.X,
; CM-NEXT: MOV * T7.W, T3.X,
; CM-NEXT: MOV * T7.Y, T5.X,
entry:
store <8 x i16> %in, ptr addrspace(1) %out
ret void
}
; Kernel that takes a <8 x i32> argument and stores it unchanged through the
; addrspace(1) pointer %out with align 4. In the autogenerated checks below,
; every target emits the vector as two 4-dword stores: one at %out and one at
; byte offset 16 from %out.
define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
; SI-LABEL: v8i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v8i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v8i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v8i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[5].X,
; EG-NEXT: MOV * T0.Z, KC0[4].W,
; EG-NEXT: MOV T0.Y, KC0[4].Z,
; EG-NEXT: MOV * T1.W, KC0[6].X,
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: MOV * T1.Z, KC0[5].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[5].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T1.X, KC0[5].Y,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.W, KC0[6].X,
; CM-NEXT: MOV * T0.Z, KC0[5].W,
; CM-NEXT: MOV * T0.Y, KC0[5].Z,
; CM-NEXT: MOV T0.X, KC0[5].Y,
; CM-NEXT: MOV * T1.W, KC0[5].X,
; CM-NEXT: MOV T1.Z, KC0[4].W,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T2.X, PV.W, literal.x,
; CM-NEXT: MOV * T1.Y, KC0[4].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T1.X, KC0[4].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Floating-point counterpart of v8i32_arg: takes a <8 x float> argument and
; stores it unchanged through the addrspace(1) pointer %out with align 4.
; The autogenerated checks below show the same shape as the integer variant:
; two 4-dword stores, one at %out and one at byte offset 16 from %out.
define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind {
; SI-LABEL: v8f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v8f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v8f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v8f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, KC0[5].X,
; EG-NEXT: MOV * T0.Z, KC0[4].W,
; EG-NEXT: MOV T0.Y, KC0[4].Z,
; EG-NEXT: MOV * T1.W, KC0[6].X,
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: MOV * T1.Z, KC0[5].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[5].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T1.X, KC0[5].Y,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.W, KC0[6].X,
; CM-NEXT: MOV * T0.Z, KC0[5].W,
; CM-NEXT: MOV * T0.Y, KC0[5].Z,
; CM-NEXT: MOV T0.X, KC0[5].Y,
; CM-NEXT: MOV * T1.W, KC0[5].X,
; CM-NEXT: MOV T1.Z, KC0[4].W,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T2.X, PV.W, literal.x,
; CM-NEXT: MOV * T1.Y, KC0[4].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T1.X, KC0[4].Y,
; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
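; FIXME: Pack/repack on VI
; NOTE(review): the VI checks for v16i8_arg below show a plain dwordx4 load
; and store with no packing or repacking, so this FIXME may be stale; confirm
; before removing it.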
; Kernel that takes a <16 x i8> argument and stores it through the
; addrspace(1) pointer %out; the store carries no explicit alignment.
; In the autogenerated checks below, the amdgcn targets emit one 4-dword
; scalar load of the argument followed by one 4-dword store. The r600 targets
; instead fetch the sixteen bytes individually with VTX_READ_8 and merge them
; into four dwords with and/shift/or sequences before a single store.
define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; SI-LABEL: v16i8_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v16i8_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v16i8_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v16i8_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @68, KC0[], KC1[]
; EG-NEXT: TEX 0 @36
; EG-NEXT: ALU 5, @70, KC0[], KC1[]
; EG-NEXT: TEX 0 @38
; EG-NEXT: ALU 5, @76, KC0[], KC1[]
; EG-NEXT: TEX 0 @40
; EG-NEXT: ALU 5, @82, KC0[], KC1[]
; EG-NEXT: TEX 0 @42
; EG-NEXT: ALU 5, @88, KC0[], KC1[]
; EG-NEXT: TEX 0 @44
; EG-NEXT: ALU 7, @94, KC0[], KC1[]
; EG-NEXT: TEX 0 @46
; EG-NEXT: ALU 7, @102, KC0[], KC1[]
; EG-NEXT: TEX 0 @48
; EG-NEXT: ALU 7, @110, KC0[], KC1[]
; EG-NEXT: TEX 0 @50
; EG-NEXT: ALU 7, @118, KC0[], KC1[]
; EG-NEXT: TEX 0 @52
; EG-NEXT: ALU 7, @126, KC0[], KC1[]
; EG-NEXT: TEX 0 @54
; EG-NEXT: ALU 7, @134, KC0[], KC1[]
; EG-NEXT: TEX 0 @56
; EG-NEXT: ALU 7, @142, KC0[], KC1[]
; EG-NEXT: TEX 0 @58
; EG-NEXT: ALU 7, @150, KC0[], KC1[]
; EG-NEXT: TEX 0 @60
; EG-NEXT: ALU 5, @158, KC0[], KC1[]
; EG-NEXT: TEX 0 @62
; EG-NEXT: ALU 5, @164, KC0[], KC1[]
; EG-NEXT: TEX 0 @64
; EG-NEXT: ALU 5, @170, KC0[], KC1[]
; EG-NEXT: TEX 0 @66
; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 36:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
; EG-NEXT: Fetch clause starting at 38:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
; EG-NEXT: Fetch clause starting at 40:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
; EG-NEXT: Fetch clause starting at 42:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
; EG-NEXT: Fetch clause starting at 44:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
; EG-NEXT: Fetch clause starting at 46:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
; EG-NEXT: Fetch clause starting at 48:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
; EG-NEXT: Fetch clause starting at 50:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
; EG-NEXT: Fetch clause starting at 52:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
; EG-NEXT: Fetch clause starting at 54:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
; EG-NEXT: Fetch clause starting at 56:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
; EG-NEXT: Fetch clause starting at 58:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
; EG-NEXT: Fetch clause starting at 60:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
; EG-NEXT: Fetch clause starting at 62:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
; EG-NEXT: Fetch clause starting at 64:
; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
; EG-NEXT: Fetch clause starting at 66:
; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
; EG-NEXT: ALU clause starting at 68:
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T7.X, 0.0,
; EG-NEXT: ALU clause starting at 70:
; EG-NEXT: LSHL T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 76:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 82:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 88:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 94:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 102:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 110:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 118:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 126:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 134:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 142:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 150:
; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65281(nan)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 158:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 164:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 170:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T7.Y, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.Y,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 176:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
; EG-NEXT: -256(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT T7.X, PV.W, PS,
; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16i8_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @68, KC0[], KC1[]
; CM-NEXT: TEX 0 @36
; CM-NEXT: ALU 5, @70, KC0[], KC1[]
; CM-NEXT: TEX 0 @38
; CM-NEXT: ALU 5, @76, KC0[], KC1[]
; CM-NEXT: TEX 0 @40
; CM-NEXT: ALU 5, @82, KC0[], KC1[]
; CM-NEXT: TEX 0 @42
; CM-NEXT: ALU 5, @88, KC0[], KC1[]
; CM-NEXT: TEX 0 @44
; CM-NEXT: ALU 7, @94, KC0[], KC1[]
; CM-NEXT: TEX 0 @46
; CM-NEXT: ALU 7, @102, KC0[], KC1[]
; CM-NEXT: TEX 0 @48
; CM-NEXT: ALU 7, @110, KC0[], KC1[]
; CM-NEXT: TEX 0 @50
; CM-NEXT: ALU 7, @118, KC0[], KC1[]
; CM-NEXT: TEX 0 @52
; CM-NEXT: ALU 7, @126, KC0[], KC1[]
; CM-NEXT: TEX 0 @54
; CM-NEXT: ALU 7, @134, KC0[], KC1[]
; CM-NEXT: TEX 0 @56
; CM-NEXT: ALU 7, @142, KC0[], KC1[]
; CM-NEXT: TEX 0 @58
; CM-NEXT: ALU 7, @150, KC0[], KC1[]
; CM-NEXT: TEX 0 @60
; CM-NEXT: ALU 5, @158, KC0[], KC1[]
; CM-NEXT: TEX 0 @62
; CM-NEXT: ALU 5, @164, KC0[], KC1[]
; CM-NEXT: TEX 0 @64
; CM-NEXT: ALU 5, @170, KC0[], KC1[]
; CM-NEXT: TEX 0 @66
; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 36:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
; CM-NEXT: Fetch clause starting at 38:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
; CM-NEXT: Fetch clause starting at 40:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
; CM-NEXT: Fetch clause starting at 42:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
; CM-NEXT: Fetch clause starting at 44:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
; CM-NEXT: Fetch clause starting at 46:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
; CM-NEXT: Fetch clause starting at 48:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
; CM-NEXT: Fetch clause starting at 50:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
; CM-NEXT: Fetch clause starting at 52:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
; CM-NEXT: Fetch clause starting at 54:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
; CM-NEXT: Fetch clause starting at 56:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
; CM-NEXT: Fetch clause starting at 58:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
; CM-NEXT: Fetch clause starting at 60:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
; CM-NEXT: Fetch clause starting at 62:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
; CM-NEXT: Fetch clause starting at 64:
; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
; CM-NEXT: Fetch clause starting at 66:
; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
; CM-NEXT: ALU clause starting at 68:
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T7.X, 0.0,
; CM-NEXT: ALU clause starting at 70:
; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 76:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 82:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 88:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 94:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 102:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 110:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 118:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 126:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 134:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 142:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 150:
; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: -65281(nan), 8(1.121039e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 158:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 164:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 170:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.Y,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 176:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T7.X, literal.y,
; CM-NEXT: -256(nan), 255(3.573311e-43)
; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <16 x i8> %in, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; SI-LABEL: v16i16_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v16i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v16i16_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v16i16_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 1, @68, KC0[], KC1[]
; EG-NEXT: TEX 0 @36
; EG-NEXT: ALU 5, @70, KC0[], KC1[]
; EG-NEXT: TEX 0 @38
; EG-NEXT: ALU 5, @76, KC0[], KC1[]
; EG-NEXT: TEX 0 @40
; EG-NEXT: ALU 5, @82, KC0[], KC1[]
; EG-NEXT: TEX 0 @42
; EG-NEXT: ALU 5, @88, KC0[], KC1[]
; EG-NEXT: TEX 0 @44
; EG-NEXT: ALU 5, @94, KC0[], KC1[]
; EG-NEXT: TEX 0 @46
; EG-NEXT: ALU 5, @100, KC0[], KC1[]
; EG-NEXT: TEX 0 @48
; EG-NEXT: ALU 5, @106, KC0[], KC1[]
; EG-NEXT: TEX 0 @50
; EG-NEXT: ALU 5, @112, KC0[], KC1[]
; EG-NEXT: TEX 0 @52
; EG-NEXT: ALU 5, @118, KC0[], KC1[]
; EG-NEXT: TEX 0 @54
; EG-NEXT: ALU 5, @124, KC0[], KC1[]
; EG-NEXT: TEX 0 @56
; EG-NEXT: ALU 5, @130, KC0[], KC1[]
; EG-NEXT: TEX 0 @58
; EG-NEXT: ALU 5, @136, KC0[], KC1[]
; EG-NEXT: TEX 0 @60
; EG-NEXT: ALU 5, @142, KC0[], KC1[]
; EG-NEXT: TEX 0 @62
; EG-NEXT: ALU 5, @148, KC0[], KC1[]
; EG-NEXT: TEX 0 @64
; EG-NEXT: ALU 5, @154, KC0[], KC1[]
; EG-NEXT: TEX 0 @66
; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 36:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
; EG-NEXT: Fetch clause starting at 38:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
; EG-NEXT: Fetch clause starting at 40:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
; EG-NEXT: Fetch clause starting at 42:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
; EG-NEXT: Fetch clause starting at 44:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
; EG-NEXT: Fetch clause starting at 46:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
; EG-NEXT: Fetch clause starting at 48:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
; EG-NEXT: Fetch clause starting at 50:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
; EG-NEXT: Fetch clause starting at 52:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
; EG-NEXT: Fetch clause starting at 54:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
; EG-NEXT: Fetch clause starting at 56:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
; EG-NEXT: Fetch clause starting at 58:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
; EG-NEXT: Fetch clause starting at 60:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
; EG-NEXT: Fetch clause starting at 62:
; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
; EG-NEXT: Fetch clause starting at 64:
; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
; EG-NEXT: Fetch clause starting at 66:
; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
; EG-NEXT: ALU clause starting at 68:
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: MOV * T11.X, 0.0,
; EG-NEXT: ALU clause starting at 70:
; EG-NEXT: LSHL T0.W, T12.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 76:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T7.X,
; EG-NEXT: ALU clause starting at 82:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T7.X, PV.W,
; EG-NEXT: MOV * T0.Y, T9.X,
; EG-NEXT: ALU clause starting at 88:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.Y, T3.X,
; EG-NEXT: ALU clause starting at 94:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T3.X, PV.W,
; EG-NEXT: MOV * T0.Y, T5.X,
; EG-NEXT: ALU clause starting at 100:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T7.X,
; EG-NEXT: ALU clause starting at 106:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T7.X, PV.W,
; EG-NEXT: MOV * T0.Y, T9.X,
; EG-NEXT: ALU clause starting at 112:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 118:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 124:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV * T0.Y, T6.X,
; EG-NEXT: ALU clause starting at 130:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T6.X, PV.W,
; EG-NEXT: MOV * T0.Y, T8.X,
; EG-NEXT: ALU clause starting at 136:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T8.X, PV.W,
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: ALU clause starting at 142:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
; EG-NEXT: MOV T2.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: ALU clause starting at 148:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T12.X, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.X,
; EG-NEXT: MOV * T0.Y, T6.X,
; EG-NEXT: ALU clause starting at 154:
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
; EG-NEXT: MOV T6.X, PV.Z,
; EG-NEXT: MOV * T0.Y, T8.X,
; EG-NEXT: ALU clause starting at 160:
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
; EG-NEXT: AND_INT * T1.W, T11.X, literal.z,
; EG-NEXT: 2(2.802597e-45), -65536(nan)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T11.X, PV.W, PS,
; EG-NEXT: MOV T8.X, PV.X,
; EG-NEXT: MOV * T12.W, T3.X,
; EG-NEXT: MOV T12.Y, T5.X,
; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212
; EG-NEXT: MOV * T11.Y, T9.X,
;
; CM-LABEL: v16i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 1, @68, KC0[], KC1[]
; CM-NEXT: TEX 0 @36
; CM-NEXT: ALU 5, @70, KC0[], KC1[]
; CM-NEXT: TEX 0 @38
; CM-NEXT: ALU 5, @76, KC0[], KC1[]
; CM-NEXT: TEX 0 @40
; CM-NEXT: ALU 5, @82, KC0[], KC1[]
; CM-NEXT: TEX 0 @42
; CM-NEXT: ALU 5, @88, KC0[], KC1[]
; CM-NEXT: TEX 0 @44
; CM-NEXT: ALU 5, @94, KC0[], KC1[]
; CM-NEXT: TEX 0 @46
; CM-NEXT: ALU 5, @100, KC0[], KC1[]
; CM-NEXT: TEX 0 @48
; CM-NEXT: ALU 5, @106, KC0[], KC1[]
; CM-NEXT: TEX 0 @50
; CM-NEXT: ALU 5, @112, KC0[], KC1[]
; CM-NEXT: TEX 0 @52
; CM-NEXT: ALU 5, @118, KC0[], KC1[]
; CM-NEXT: TEX 0 @54
; CM-NEXT: ALU 5, @124, KC0[], KC1[]
; CM-NEXT: TEX 0 @56
; CM-NEXT: ALU 5, @130, KC0[], KC1[]
; CM-NEXT: TEX 0 @58
; CM-NEXT: ALU 5, @136, KC0[], KC1[]
; CM-NEXT: TEX 0 @60
; CM-NEXT: ALU 5, @142, KC0[], KC1[]
; CM-NEXT: TEX 0 @62
; CM-NEXT: ALU 5, @148, KC0[], KC1[]
; CM-NEXT: TEX 0 @64
; CM-NEXT: ALU 5, @154, KC0[], KC1[]
; CM-NEXT: TEX 0 @66
; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 36:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
; CM-NEXT: Fetch clause starting at 38:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
; CM-NEXT: Fetch clause starting at 40:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
; CM-NEXT: Fetch clause starting at 42:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
; CM-NEXT: Fetch clause starting at 44:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
; CM-NEXT: Fetch clause starting at 46:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
; CM-NEXT: Fetch clause starting at 48:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
; CM-NEXT: Fetch clause starting at 50:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
; CM-NEXT: Fetch clause starting at 52:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
; CM-NEXT: Fetch clause starting at 54:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
; CM-NEXT: Fetch clause starting at 56:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
; CM-NEXT: Fetch clause starting at 58:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
; CM-NEXT: Fetch clause starting at 60:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
; CM-NEXT: Fetch clause starting at 62:
; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
; CM-NEXT: Fetch clause starting at 64:
; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
; CM-NEXT: Fetch clause starting at 66:
; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
; CM-NEXT: ALU clause starting at 68:
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: MOV * T11.X, 0.0,
; CM-NEXT: ALU clause starting at 70:
; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 76:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T7.X,
; CM-NEXT: ALU clause starting at 82:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T7.X, PV.W,
; CM-NEXT: MOV * T0.Y, T9.X,
; CM-NEXT: ALU clause starting at 88:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T9.X, PV.W,
; CM-NEXT: MOV * T0.Y, T3.X,
; CM-NEXT: ALU clause starting at 94:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T3.X, PV.W,
; CM-NEXT: MOV * T0.Y, T5.X,
; CM-NEXT: ALU clause starting at 100:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T5.X, PV.W,
; CM-NEXT: MOV * T0.Y, T7.X,
; CM-NEXT: ALU clause starting at 106:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T7.X, PV.W,
; CM-NEXT: MOV * T0.Y, T9.X,
; CM-NEXT: ALU clause starting at 112:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T9.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 118:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 124:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV * T0.Y, T6.X,
; CM-NEXT: ALU clause starting at 130:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T6.X, PV.W,
; CM-NEXT: MOV * T0.Y, T8.X,
; CM-NEXT: ALU clause starting at 136:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T8.X, PV.W,
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: ALU clause starting at 142:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
; CM-NEXT: MOV T2.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: ALU clause starting at 148:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.X,
; CM-NEXT: MOV * T0.Y, T6.X,
; CM-NEXT: ALU clause starting at 154:
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
; CM-NEXT: MOV T6.X, PV.Z,
; CM-NEXT: MOV * T0.Y, T8.X,
; CM-NEXT: ALU clause starting at 160:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
; CM-NEXT: AND_INT * T0.W, T11.X, literal.z,
; CM-NEXT: 2(2.802597e-45), -65536(nan)
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W,
; CM-NEXT: MOV T8.X, PV.X,
; CM-NEXT: MOV * T12.W, T3.X,
; CM-NEXT: MOV T12.Y, T5.X,
; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212
; CM-NEXT: MOV * T11.Y, T9.X,
entry:
store <16 x i16> %in, ptr addrspace(1) %out
ret void
}
; Kernel-argument test: a <16 x i32> passed by value is stored unchanged to
; the global (addrspace(1)) pointer %out with 4-byte alignment.
; The GCN checks below expect the whole vector to be fetched with a single
; s_load_dwordx16 and written back as four dwordx4 stores at offsets 48, 32,
; 16 and 0. The R600 checks expect the elements to be moved out of KC0
; constant registers and written with four MEM_RAT_CACHELESS stores.
; All check lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind {
; SI-LABEL: v16i32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s20
; SI-NEXT: v_mov_b32_e32 v1, s21
; SI-NEXT: v_mov_b32_e32 v2, s22
; SI-NEXT: v_mov_b32_e32 v3, s23
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v16i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s20
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v1, s21
; VI-NEXT: v_mov_b32_e32 v2, s22
; VI-NEXT: v_mov_b32_e32 v3, s23
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s16
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v16i32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s24
; GFX9-NEXT: v_mov_b32_e32 v1, s25
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: v_mov_b32_e32 v3, s27
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v16i32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, KC0[7].X,
; EG-NEXT: MOV * T0.Z, KC0[6].W,
; EG-NEXT: MOV T0.Y, KC0[6].Z,
; EG-NEXT: MOV * T1.W, KC0[8].X,
; EG-NEXT: MOV T0.X, KC0[6].Y,
; EG-NEXT: MOV * T1.Z, KC0[7].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[7].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T3.W, KC0[9].X,
; EG-NEXT: MOV T1.X, KC0[7].Y,
; EG-NEXT: MOV * T3.Z, KC0[8].W,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: MOV T3.Y, KC0[8].Z,
; EG-NEXT: MOV * T5.W, KC0[10].X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T3.X, KC0[8].Y,
; EG-NEXT: MOV * T5.Z, KC0[9].W,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, PV.W, literal.x,
; EG-NEXT: MOV T5.Y, KC0[9].Z,
; EG-NEXT: MOV * T5.X, KC0[9].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16i32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MOV * T0.W, KC0[10].X,
; CM-NEXT: MOV * T0.Z, KC0[9].W,
; CM-NEXT: MOV * T0.Y, KC0[9].Z,
; CM-NEXT: MOV T0.X, KC0[9].Y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T2.W, KC0[9].X,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: MOV T2.Z, KC0[8].W,
; CM-NEXT: MOV * T1.W, KC0[8].X,
; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
; CM-NEXT: MOV T2.Y, KC0[8].Z,
; CM-NEXT: MOV * T1.Z, KC0[7].W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T2.X, KC0[8].Y,
; CM-NEXT: MOV * T1.Y, KC0[7].Z,
; CM-NEXT: MOV T1.X, KC0[7].Y,
; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T4.W, KC0[7].X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
; CM-NEXT: MOV T4.Z, KC0[6].W,
; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T6.X, PV.W, literal.x,
; CM-NEXT: MOV * T4.Y, KC0[6].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T4.X, KC0[6].Y,
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <16 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel-argument test: a <16 x float> passed by value is stored unchanged to
; the global (addrspace(1)) pointer %out with 4-byte alignment.
; The GCN checks below expect the whole vector to be fetched with a single
; s_load_dwordx16 and written back as four dwordx4 stores at offsets 48, 32,
; 16 and 0. The R600 checks expect the elements to be moved out of KC0
; constant registers and written with four MEM_RAT_CACHELESS stores.
; All check lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind {
; SI-LABEL: v16f32_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s20
; SI-NEXT: v_mov_b32_e32 v1, s21
; SI-NEXT: v_mov_b32_e32 v2, s22
; SI-NEXT: v_mov_b32_e32 v3, s23
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v16f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s20
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v1, s21
; VI-NEXT: v_mov_b32_e32 v2, s22
; VI-NEXT: v_mov_b32_e32 v3, s23
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s16
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v16f32_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s24
; GFX9-NEXT: v_mov_b32_e32 v1, s25
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: v_mov_b32_e32 v3, s27
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v16f32_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, KC0[7].X,
; EG-NEXT: MOV * T0.Z, KC0[6].W,
; EG-NEXT: MOV T0.Y, KC0[6].Z,
; EG-NEXT: MOV * T1.W, KC0[8].X,
; EG-NEXT: MOV T0.X, KC0[6].Y,
; EG-NEXT: MOV * T1.Z, KC0[7].W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.Y, KC0[7].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T3.W, KC0[9].X,
; EG-NEXT: MOV T1.X, KC0[7].Y,
; EG-NEXT: MOV * T3.Z, KC0[8].W,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: MOV T3.Y, KC0[8].Z,
; EG-NEXT: MOV * T5.W, KC0[10].X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T3.X, KC0[8].Y,
; EG-NEXT: MOV * T5.Z, KC0[9].W,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, PV.W, literal.x,
; EG-NEXT: MOV T5.Y, KC0[9].Z,
; EG-NEXT: MOV * T5.X, KC0[9].Y,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16f32_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: MOV * T0.W, KC0[10].X,
; CM-NEXT: MOV * T0.Z, KC0[9].W,
; CM-NEXT: MOV * T0.Y, KC0[9].Z,
; CM-NEXT: MOV T0.X, KC0[9].Y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T2.W, KC0[9].X,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: MOV T2.Z, KC0[8].W,
; CM-NEXT: MOV * T1.W, KC0[8].X,
; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
; CM-NEXT: MOV T2.Y, KC0[8].Z,
; CM-NEXT: MOV * T1.Z, KC0[7].W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T2.X, KC0[8].Y,
; CM-NEXT: MOV * T1.Y, KC0[7].Z,
; CM-NEXT: MOV T1.X, KC0[7].Y,
; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T4.W, KC0[7].X,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
; CM-NEXT: MOV T4.Z, KC0[6].W,
; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T6.X, PV.W, literal.x,
; CM-NEXT: MOV * T4.Y, KC0[6].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T4.X, KC0[6].Y,
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <16 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
; Kernel-argument test: a scalar i64 passed by value is stored to the global
; (addrspace(1)) pointer %out with 8-byte alignment.
; The GCN checks below expect one s_load_dwordx4 to fetch both the pointer and
; the i64, followed by a single dwordx2 store. The R600 checks expect the two
; halves to be moved out of KC0[2].W and KC0[3].X and written with one
; MEM_RAT_CACHELESS store.
; All check lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: kernel_arg_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: kernel_arg_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: kernel_arg_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: kernel_arg_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].X,
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: kernel_arg_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.Y, KC0[3].X,
; CM-NEXT: MOV * T0.X, KC0[2].W,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
; Kernel-argument test: a scalar double passed by value is stored to the
; global (addrspace(1)) pointer %out (no explicit alignment on the store).
; The GCN checks below expect one s_load_dwordx4 to fetch both the pointer and
; the double, followed by a single dwordx2 store. The R600 checks expect the
; two halves to be moved out of KC0[2].W and KC0[3].X and written with one
; MEM_RAT_CACHELESS store.
; All check lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
; SI-LABEL: f64_kernel_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: f64_kernel_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f64_kernel_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: f64_kernel_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[3].X,
; EG-NEXT: MOV T0.X, KC0[2].W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: f64_kernel_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MOV * T0.Y, KC0[3].X,
; CM-NEXT: MOV * T0.X, KC0[2].W,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store double %in, ptr addrspace(1) %out
ret void
}
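; FIXME: The <1 x i64> kernel-argument test below is disabled. The function is
; commented out and its check lines use X-prefixed names that match none of
; the check prefixes on the RUN lines, so FileCheck ignores them. The reason
; it was disabled is not recorded here; confirm before re-enabling.
; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind {
; store <1 x i64> %a, ptr addrspace(1) %out, align 8
; ret void
; }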
; Kernel-argument test for a non-power-of-two integer width: an i65 passed by
; value is stored to the global (addrspace(1)) pointer %out with 4-byte
; alignment.
; The GCN checks below expect the low 64 bits to be written with a dwordx2
; store, and the remaining bit to be masked with an AND against 1 and written
; as a single byte at offset 8. The R600 checks expect two dword stores plus a
; MEM_RAT MSKOR write for that byte.
; All check lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
; SI-LABEL: i65_arg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0xd
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s8, s6, 1
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i65_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s6, s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s4, s6, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s0, 8
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_byte v[4:5], v6
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i65_arg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i65_arg:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.X, PS, PV.W,
; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T1.Y, 0.0,
; EG-NEXT: MOV * T1.Z, 0.0,
; EG-NEXT: LSHR T0.X, T0.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.X, KC0[3].X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T5.X, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i65_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL T1.X, PV.W, PV.Z,
; CM-NEXT: LSHL * T1.W, literal.x, PV.Z,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV T1.Y, 0.0,
; CM-NEXT: MOV * T1.Z, 0.0,
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV T2.X, KC0[2].W,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T3.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T4.X, KC0[3].X,
; CM-NEXT: LSHR * T5.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store i65 %in, ptr addrspace(1) %out, align 4
ret void
}
; i1 kernel argument stored directly to %out with a one-byte store (align 1).
; The GCN runs are expected to mask the loaded dword with 1 (s_and_b32) and
; emit a byte store. The R600 runs are expected to fetch the argument with
; VTX_READ_8, mask it with AND_INT ..., 1 and write it through MEM_RAT MSKOR.
define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; SI-LABEL: i1_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i1_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i1_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i1_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i1_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.X, 1,
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, 0.0,
; CM-NEXT: MOV * T0.Z, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
store i1 %x, ptr addrspace(1) %out, align 1
ret void
}
; i1 kernel argument zero-extended to i32 and stored to %out as a dword.
; The GCN runs expect the loaded dword to be masked with s_and_b32 ..., 1
; before a dword store. The R600 runs expect the VTX_READ_8 result to be
; stored with no AND_INT; their second ALU clause holds only the LSHR of
; KC0[2].Y.
define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
; SI-LABEL: i1_arg_zext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s2, 1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i1_arg_zext_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i1_arg_zext_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i1_arg_zext_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i1_arg_zext_i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i1 %x to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; i1 kernel argument zero-extended to i64 and stored to %out (align 8).
; The GCN runs expect the low dword to be the argument masked with
; s_and_b32 ..., 1 and the high dword to be a constant 0, written with one
; dwordx2 store. The R600 runs expect the high half to come from
; MOV T0.Y, 0.0.
define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
; SI-LABEL: i1_arg_zext_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 1
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i1_arg_zext_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i1_arg_zext_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i1_arg_zext_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i1_arg_zext_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T0.Y, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = zext i1 %x to i64
store i64 %ext, ptr addrspace(1) %out, align 8
ret void
}
; i1 kernel argument sign-extended to i32 and stored to %out as a dword.
; The GCN runs expect the extension to be done with s_bfe_i32 ..., 0x10000
; on the loaded dword. The R600 runs expect BFE_INT ..., 0.0, 1 applied to
; the VTX_READ_8 result.
define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
; SI-LABEL: i1_arg_sext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i1_arg_sext_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i1_arg_sext_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i1_arg_sext_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i1_arg_sext_i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = sext i1 %x to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; i1 kernel argument sign-extended to i64 and stored to %out (align 8).
; The GCN runs expect a single 64-bit s_bfe_i64 ..., 0x10000 to produce both
; halves, followed by one dwordx2 store. The R600 runs expect BFE_INT into
; T0.X and a MOV of that result into T0.Y for the high half.
define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
; SI-LABEL: i1_arg_sext_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i1_arg_sext_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i1_arg_sext_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: i1_arg_sext_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T0.Y, PV.X,
;
; CM-LABEL: i1_arg_sext_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ext = sext i1 %x to i64
store i64 %ext, ptr addrspace(1) %out, align 8
ret void
}
; Kernel whose only argument is an empty struct ({}) and whose body is just
; `ret void`. Every run is expected to emit nothing but the program
; terminator (s_endpgm on the GCN runs, CF_END plus PAD on the R600 runs),
; i.e. no argument loads at all.
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
; SI-LABEL: empty_struct_arg:
; SI: ; %bb.0:
; SI-NEXT: s_endpgm
;
; VI-LABEL: empty_struct_arg:
; VI: ; %bb.0:
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: empty_struct_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_endpgm
;
; EGCM-LABEL: empty_struct_arg:
; EGCM: ; %bb.0:
; EGCM-NEXT: CF_END
; EGCM-NEXT: PAD
ret void
}
; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32
; With the SelectionDAG argument lowering, the alignment of the
; struct members is not properly considered, making these wrong.
; FIXME: Total argument size is computed wrong
; Two {i32, i64} struct arguments separated by an unnamed i8 argument. Each
; struct member is extracted and volatile-stored to a null global pointer so
; that all four values have to be loaded from the kernel arguments.
; The GCN runs expect four scalar loads (dword, dwordx2, dword, dwordx2).
; NOTE(review): the GFX9 run loads from 0x0, 0x8, 0x18 and 0x20, which looks
; like the 0 / 8 / 24 / 32 offsets listed in the comment preceding this
; function - confirm whether the "making these wrong" remark is still current.
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
; SI-LABEL: struct_argument_alignment:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xb
; SI-NEXT: s_load_dword s9, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: struct_argument_alignment:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; VI-NEXT: s_load_dword s7, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x44
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: struct_argument_alignment:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX9-NEXT: s_load_dword s5, s[8:9], 0x18
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: struct_argument_alignment:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: MOV * T1.X, KC0[4].Z,
; EG-NEXT: MOV T2.X, KC0[3].W,
; EG-NEXT: MOV * T3.X, KC0[2].W,
; EG-NEXT: MOV T4.X, literal.x,
; EG-NEXT: MOV * T5.X, KC0[3].X,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T6.X, literal.x,
; EG-NEXT: MOV * T7.X, KC0[2].Y,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: struct_argument_alignment:
; CM: ; %bb.0:
; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[4].Y,
; CM-NEXT: MOV * T1.X, KC0[4].Z,
; CM-NEXT: MOV * T2.X, KC0[3].W,
; CM-NEXT: MOV * T3.X, KC0[2].W,
; CM-NEXT: MOV * T4.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T5.X, KC0[3].X,
; CM-NEXT: MOV * T6.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T7.X, KC0[2].Y,
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg1, 0
%val3 = extractvalue {i32, i64} %arg1, 1
store volatile i32 %val0, ptr addrspace(1) null
store volatile i64 %val1, ptr addrspace(1) null
store volatile i32 %val2, ptr addrspace(1) null
store volatile i64 %val3, ptr addrspace(1) null
ret void
}
; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; Two packed <{i32, i64}> struct arguments separated by an unnamed i8. Each
; member is extracted and volatile-stored to a null global pointer.
; The GCN runs expect the first struct to be fetched with s_load_dword and
; s_load_dwordx2, while the second struct is fetched with vector memory
; loads (individual ubyte loads plus a dwordx2 load in the SI and VI runs,
; global_load_dword / global_load_dwordx2 at offsets 13 and 17 in the GFX9
; run).
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; SI-LABEL: packed_struct_argument_alignment:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: packed_struct_argument_alignment:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s0, 3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s4, 51
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v9, v[2:3]
; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dword v[2:3], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; VI-NEXT: v_or_b32_e32 v4, v4, v9
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: packed_struct_argument_alignment:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dword v6, v2, s[8:9] offset:13
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[8:9] offset:17
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: global_store_dword v[2:3], v7, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[2:3], v6, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: packed_struct_argument_alignment:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
; EG-NEXT: ALU 2, @25, KC0[], KC1[]
; EG-NEXT: TEX 0 @12
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
; EG-NEXT: TEX 0 @14
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
; EG-NEXT: TEX 0 @16
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
; EG-NEXT: Fetch clause starting at 14:
; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV T0.X, KC0[2].Z,
; EG-NEXT: MOV * T1.X, literal.x,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T2.X, KC0[2].W,
; EG-NEXT: MOV * T3.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: MOV * T4.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: MOV * T2.X, 0.0,
; EG-NEXT: MOV * T4.X, 0.0,
;
; CM-LABEL: packed_struct_argument_alignment:
; CM: ; %bb.0:
; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
; CM-NEXT: ALU 2, @25, KC0[], KC1[]
; CM-NEXT: TEX 0 @12
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
; CM-NEXT: TEX 0 @14
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: TEX 0 @16
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
; CM-NEXT: Fetch clause starting at 14:
; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
; CM-NEXT: ALU clause starting at 18:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: MOV * T1.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T2.X, KC0[2].W,
; CM-NEXT: MOV * T3.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T4.X, KC0[2].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: MOV * T2.X, 0.0,
; CM-NEXT: MOV * T4.X, 0.0,
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
%val2 = extractvalue <{i32, i64}> %arg1, 0
%val3 = extractvalue <{i32, i64}> %arg1, 1
store volatile i32 %val0, ptr addrspace(1) null
store volatile i64 %val1, ptr addrspace(1) null
store volatile i32 %val2, ptr addrspace(1) null
store volatile i64 %val3, ptr addrspace(1) null
ret void
}
; Two {i32, i64} struct arguments, each followed by an unnamed i8, and then
; a trailing <4 x i32> argument. All four struct members and the vector are
; volatile-stored to a null global pointer.
; The GCN runs expect the same four scalar loads as in
; struct_argument_alignment plus one s_load_dwordx4 for %arg4 (offset 0x30
; in the GFX9 run), so this also pins where the argument placed after the
; structs is read from.
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
; SI-LABEL: struct_argument_alignment_after:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s12, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb
; SI-NEXT: s_load_dword s13, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x11
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: struct_argument_alignment_after:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s10, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c
; VI-NEXT: s_load_dword s11, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: struct_argument_alignment_after:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX9-NEXT: s_load_dword s11, s[8:9], 0x18
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x20
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: struct_argument_alignment_after:
; EG: ; %bb.0:
; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.W, KC0[6].X,
; EG-NEXT: MOV * T0.Z, KC0[5].W,
; EG-NEXT: MOV * T0.Y, KC0[5].Z,
; EG-NEXT: MOV T0.X, KC0[5].Y,
; EG-NEXT: MOV * T1.X, KC0[4].Y,
; EG-NEXT: MOV T2.X, KC0[4].Z,
; EG-NEXT: MOV * T3.X, KC0[3].W,
; EG-NEXT: MOV T4.X, KC0[2].W,
; EG-NEXT: MOV * T5.X, literal.x,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T6.X, KC0[3].X,
; EG-NEXT: MOV * T7.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: MOV * T8.X, KC0[2].Y,
;
; CM-LABEL: struct_argument_alignment_after:
; CM: ; %bb.0:
; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.W, KC0[6].X,
; CM-NEXT: MOV * T0.Z, KC0[5].W,
; CM-NEXT: MOV * T0.Y, KC0[5].Z,
; CM-NEXT: MOV * T0.X, KC0[5].Y,
; CM-NEXT: MOV * T1.X, KC0[4].Y,
; CM-NEXT: MOV * T2.X, KC0[4].Z,
; CM-NEXT: MOV * T3.X, KC0[3].W,
; CM-NEXT: MOV * T4.X, KC0[2].W,
; CM-NEXT: MOV * T5.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T6.X, KC0[3].X,
; CM-NEXT: MOV * T7.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T8.X, KC0[2].Y,
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg2, 0
%val3 = extractvalue {i32, i64} %arg2, 1
store volatile i32 %val0, ptr addrspace(1) null
store volatile i64 %val1, ptr addrspace(1) null
store volatile i32 %val2, ptr addrspace(1) null
store volatile i64 %val3, ptr addrspace(1) null
store volatile <4 x i32> %arg4, ptr addrspace(1) null
ret void
}
; i16 argument followed by a [3 x i32] array argument; both are
; volatile-stored to a poison global pointer, so only the argument loads and
; the store widths are meaningful.
; The GCN runs expect a single s_load_dwordx4 to fetch all of the arguments,
; one short store for %arg0 and three separate dword stores for the array
; elements (taken from s3, s2 and s1, in that order).
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
; SI-LABEL: array_3xi32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: array_3xi32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: array_3xi32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_short v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: array_3xi32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: MOV T1.X, KC0[2].Z,
; EG-NEXT: MOV * T2.X, KC0[2].W,
; EG-NEXT: MOV T3.X, KC0[3].X,
; EG-NEXT: MOV * T4.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: array_3xi32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T0.XW, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: AND_INT T0.X, T0.X, literal.x,
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, 0.0,
; CM-NEXT: MOV * T0.Z, 0.0,
; CM-NEXT: MOV * T1.X, KC0[2].Z,
; CM-NEXT: MOV * T2.X, KC0[2].W,
; CM-NEXT: MOV * T3.X, KC0[3].X,
; CM-NEXT: MOV * T4.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
store volatile i16 %arg0, ptr addrspace(1) poison
store volatile [3 x i32] %arg1, ptr addrspace(1) poison
ret void
}
; FIXME: Why are the [3 x i16] elements not all read with scalar loads?
; Kernel taking an i8 followed by a [3 x i16] array argument. Both arguments
; are written with volatile stores to a poison global (addrspace(1)) pointer.
; In the expected GCN output the i8 is read with s_load_dword, while each i16
; element is read with its own ushort vector-memory load (buffer, flat or
; global depending on the target) at byte offsets 2, 4 and 6 past the i8.
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
; SI-LABEL: array_3xi16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:42
; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:40
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:38
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s0
; SI-NEXT: buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: array_3xi16:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 38
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s0, 2
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s4, 42
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v4, v[4:5]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_byte v[0:1], v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: array_3xi16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] offset:6
; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] offset:4
; GFX9-NEXT: global_load_ushort v3, v0, s[8:9] offset:2
; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: array_3xi16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @20, KC0[], KC1[]
; EG-NEXT: TEX 1 @12
; EG-NEXT: ALU 11, @21, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @33, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: TEX 0 @18
; EG-NEXT: ALU 3, @37, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 21:
; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV * T1.Y, 0.0,
; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
; EG-NEXT: MOV * T2.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV T1.Z, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: MOV * T3.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: AND_INT T2.X, T1.X, literal.x,
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: AND_INT T2.X, T0.X, literal.x,
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
;
; CM-LABEL: array_3xi16:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @20, KC0[], KC1[]
; CM-NEXT: TEX 1 @12
; CM-NEXT: ALU 11, @21, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @33, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: TEX 0 @18
; CM-NEXT: ALU 3, @37, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
; CM-NEXT: ALU clause starting at 20:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 21:
; CM-NEXT: AND_INT T1.X, T1.X, literal.x,
; CM-NEXT: MOV * T1.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV * T1.Y, 0.0,
; CM-NEXT: AND_INT T2.X, T2.X, literal.x,
; CM-NEXT: MOV * T2.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T1.Z, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: MOV * T3.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: AND_INT T2.X, T1.X, literal.x,
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: AND_INT T2.X, T0.X, literal.x,
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
store volatile i8 %arg0, ptr addrspace(1) poison
store volatile [3 x i16] %arg1, ptr addrspace(1) poison
ret void
}
; Kernel with an unnamed i8 argument followed by a [1 x i8] array. Element 0 of
; the array is extracted and written with a volatile store to a poison global
; (addrspace(1)) pointer; the leading i8 is never used.
; Every target is expected to read a single byte located one byte past the
; first argument (offset:1 in the GFX9 checks, 37 in the others).
; NOTE(review): the name suggests this guards against the array's byte offset
; being rounded down to an aligned boundary - confirm against the commit that
; introduced the test.
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
; SI-LABEL: small_array_round_down_offset:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: small_array_round_down_offset:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 37
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: small_array_round_down_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EGCM-LABEL: small_array_round_down_offset:
; EGCM: ; %bb.0:
; EGCM-NEXT: ALU 0, @8, KC0[], KC1[]
; EGCM-NEXT: TEX 0 @6
; EGCM-NEXT: ALU 6, @9, KC0[], KC1[]
; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EGCM-NEXT: CF_END
; EGCM-NEXT: PAD
; EGCM-NEXT: Fetch clause starting at 6:
; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3
; EGCM-NEXT: ALU clause starting at 8:
; EGCM-NEXT: MOV * T0.X, 0.0,
; EGCM-NEXT: ALU clause starting at 9:
; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x,
; EGCM-NEXT: MOV * T0.W, literal.x,
; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EGCM-NEXT: MOV T0.Y, 0.0,
; EGCM-NEXT: MOV * T0.Z, 0.0,
; EGCM-NEXT: MOV * T1.X, literal.x,
; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
%val = extractvalue [1 x i8] %arg, 0
store volatile i8 %val, ptr addrspace(1) poison
ret void
}
; Kernel whose second argument is a byref(i32) pointer in addrspace(4) with an
; explicit align(256), followed by a plain i32 (%after.offset).
; The body loads the i32 through the byref pointer and volatile-stores it to
; %out, then volatile-stores %after.offset to the same address.
; In the expected GCN output both values are fetched by one s_load_dwordx2
; placed 0x100 bytes past the %out pointer load (0x0 vs 0x100 in the GFX9
; checks), i.e. %after.offset sits directly behind the over-aligned byref i32.
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_align_constant_i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x49
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_align_constant_i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x124
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_align_constant_i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x100
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_align_constant_i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[18].Y,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T1.X, KC0[18].Z,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_align_constant_i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[18].Y,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T1.X, KC0[18].Z,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}
; Kernel taking %out, an unnamed i8, a byref(<16 x i32>) pointer in
; addrspace(4) with align(64), and a trailing i32 (%after.offset).
; The body loads the whole <16 x i32> through the byref pointer and
; volatile-stores it to %out (align 4), then volatile-stores %after.offset to
; the same address.
; In the expected GCN output the vector is fetched with one s_load_dwordx16
; and %after.offset with a separate s_load_dword; in the GFX9 checks these sit
; at kernarg offsets 0x40 and 0x80, with %out at 0x0. The vector is then
; written back as four dwordx4 stores, highest offset (48) first.
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_natural_align_constant_v16i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0x29
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s20
; SI-NEXT: v_mov_b32_e32 v1, s21
; SI-NEXT: v_mov_b32_e32 v2, s22
; SI-NEXT: v_mov_b32_e32 v3, s23
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_natural_align_constant_v16i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0xa4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s20
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v1, s21
; VI-NEXT: v_mov_b32_e32 v2, s22
; VI-NEXT: v_mov_b32_e32 v3, s23
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s16
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x80
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s24
; GFX9-NEXT: v_mov_b32_e32 v1, s25
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: v_mov_b32_e32 v3, s27
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_natural_align_constant_v16i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @18
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 24:
; EG-NEXT: MOV * T0.X, KC0[6].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 29:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: MOV T1.X, KC0[10].Y,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_natural_align_constant_v16i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @18
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 24:
; CM-NEXT: MOV * T0.X, KC0[6].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 29:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: MOV * T1.X, KC0[10].Y,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%in = load <16 x i32>, ptr addrspace(4) %in.byref
store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}