llvm-project/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
Matt Arsenault a216358ce7
AMDGPU: Replace amdgpu-no-agpr with amdgpu-agpr-alloc (#129893)
This performs the minimal replacement of amdgpu-no-agpr to
amdgpu-agpr-alloc=0. Most of the test diffs are due to the new
attribute sorting later alphabetically.

We could do better by trying to perform range merging in the attributor,
and trying to pick non-0 values.
2025-03-06 09:17:51 +07:00

1366 lines
50 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
; Zero-extends an inreg i8 kernel argument to i32 and stores it to %out.
; The expected code loads %out (kernarg offset 0x0) and the dword holding
; %arg0 (offset 0x8) in the prologue before the s_branch, then masks the
; value with 0xff in the 256-byte-aligned body before the 32-bit store.
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i8:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB0_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB0_0:
; GFX942-NEXT: s_and_b32 s0, s4, 0xff
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i8:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
; Zero-extends an inreg i8 kernel argument that also carries the zeroext
; attribute and stores the i32 result to %out. The expected code still
; applies the 0xff mask to the dword loaded from kernarg offset 0x8.
define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i8_zext_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB1_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB1_0:
; GFX942-NEXT: s_and_b32 s0, s4, 0xff
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i8_zext_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Zero-extends an inreg i16 kernel argument to i32 and stores it to %out.
; The expected body masks the dword loaded from kernarg offset 0x8 with
; 0xffff before the 32-bit store.
define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i16_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB2_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB2_0:
; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i16_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB2_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg i32 kernel argument to %out unchanged. The expected body
; copies the dword loaded from kernarg offset 0x8 straight into the store
; data register, with no masking.
define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i32_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB3_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB3_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB3_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB3_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
; Adds two inreg i32 arguments that sit on either side of the %out pointer
; and stores the sum to %out. The expected prologue issues loads at kernarg
; offsets 0x0, 0x8 and 0x10; the body adds the dword from offset 0x0 to the
; dword from offset 0x10 and stores through the pointer loaded from 0x8.
define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 {
; GFX942-LABEL: i32_ptr1_i32_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_load_dword s6, s[0:1], 0x10
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB4_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB4_0:
; GFX942-NEXT: s_add_i32 s0, s2, s6
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[4:5]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB4_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB4_0:
; GFX90a-NEXT: s_add_i32 s0, s6, s10
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
; Zero-extends two inreg i16 arguments to i32, adds them and stores the sum
; to %out. In the expected code both arguments come from the single dword
; loaded at kernarg offset 0x8: one is extracted with a 16-bit right shift,
; the other with a 0xffff mask.
define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 {
; GFX942-LABEL: ptr1_i16_i16_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB5_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB5_0:
; GFX942-NEXT: s_lshr_b32 s0, s4, 16
; GFX942-NEXT: s_and_b32 s1, s4, 0xffff
; GFX942-NEXT: s_add_i32 s0, s1, s0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
; GFX90a-NEXT: s_add_i32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
store i32 %add, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <2 x i8> kernel argument to %out. The expected body moves
; the dword loaded from kernarg offset 0x8 into a VGPR and writes it with a
; single 16-bit store.
define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 {
; GFX942-LABEL: ptr1_v2i8_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB6_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB6_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_v2i8_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB6_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
; Loads an i32 through a byref(i32) align(256) argument and volatile-stores
; it, followed by the plain (non-inreg) %after.offset, to %out.
; The expected prologue loads only %out (kernarg offset 0x0). The byref value
; and %after.offset are fetched together in the body by one s_load_dwordx2 at
; offset 0x100, and each volatile store is followed by s_waitcnt vmcnt(0).
define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
; GFX942-LABEL: byref_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB7_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB7_0:
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: v_mov_b32_e32 v2, s5
; GFX942-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB7_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}
; %after.offset is marked inreg but follows the non-inreg byref argument, so
; it is not expected to be preloaded with the current behavior.
; Loads an i32 through a byref(i32) align(256) argument and volatile-stores
; it, followed by the inreg %after.offset, to %out.
; The expected prologue loads only %out (kernarg offset 0x0); both stored
; values come from an s_load_dwordx2 at offset 0x100 issued in the body.
define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 inreg %after.offset) #0 {
; GFX942-LABEL: byref_staggered_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB8_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB8_0:
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: v_mov_b32_e32 v2, s5
; GFX942-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB8_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <8 x i32> kernel argument to %out.
; The expected prologue loads only %out; the vector itself is fetched in the
; body with an s_load_dwordx8 at kernarg offset 0x20 and written back as two
; 128-bit stores (upper half at offset 16 first, then the lower half).
define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 {
; GFX942-LABEL: v8i32_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB9_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB9_0:
; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, s8
; GFX942-NEXT: v_mov_b32_e32 v1, s9
; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: v_mov_b32_e32 v3, s11
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mov_b32_e32 v0, s4
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: v_mov_b32_e32 v2, s6
; GFX942-NEXT: v_mov_b32_e32 v3, s7
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v8i32_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-NEXT: v_mov_b32_e32 v3, s11
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <3 x i16> kernel argument to %out.
; The expected prologue loads the vector as two dwords at kernarg offset 0x8;
; the body writes the third element with a 16-bit store at offset 4 and the
; first two elements with one 32-bit store.
define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 {
; GFX942-LABEL: v3i16_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB10_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB10_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v3i16_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <3 x i32> kernel argument to %out.
; The expected prologue loads four dwords at kernarg offset 0x8 and two more
; at 0x18; the body stores the three dwords that correspond to offsets
; 0x10, 0x14 and 0x18 with a single 96-bit store.
define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 {
; GFX942-LABEL: v3i32_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB11_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB11_0:
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v3i32_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <3 x float> kernel argument to %out.
; The expected prologue loads four dwords at kernarg offset 0x8 and two more
; at 0x18; the body stores the three dwords that correspond to offsets
; 0x10, 0x14 and 0x18 with a single 96-bit store.
define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 {
; GFX942-LABEL: v3f32_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB12_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB12_0:
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v3f32_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <5 x i8> kernel argument to %out.
; The expected prologue loads the vector as two dwords at kernarg offset 0x8.
; The body rebuilds the first dword from its low 16 bits and its two upper
; bytes with a shift/mask/or sequence, stores the fifth element as a single
; byte at offset 4, and stores the rebuilt dword at offset 0.
define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 {
; GFX942-LABEL: v5i8_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB13_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB13_0:
; GFX942-NEXT: s_lshr_b32 s1, s4, 24
; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
; GFX942-NEXT: s_lshl_b32 s1, s1, 8
; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX942-NEXT: s_or_b32 s1, s4, s1
; GFX942-NEXT: s_lshl_b32 s1, s1, 16
; GFX942-NEXT: s_or_b32 s0, s0, s1
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: global_store_byte v0, v1, s[2:3] offset:4
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v5i8_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
; Stores an inreg <5 x double> kernel argument to %out.
; The expected prologue loads only %out; the vector is fetched in the body
; with an s_load_dwordx8 at kernarg offset 0x40 plus an s_load_dwordx2 at
; 0x60, then written as a 64-bit store at offset 32 and two 128-bit stores at
; offsets 16 and 0. The 64-bit SGPR-to-VGPR copy uses v_mov_b64 on gfx942 and
; v_pk_mov_b32 on gfx90a.
define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 {
; GFX942-LABEL: v5f64_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB14_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB14_0:
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
; GFX942-NEXT: v_mov_b32_e32 v0, s8
; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
; GFX942-NEXT: v_mov_b32_e32 v1, s9
; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: v_mov_b32_e32 v3, s11
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mov_b32_e32 v0, s4
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: v_mov_b32_e32 v2, s6
; GFX942-NEXT: v_mov_b32_e32 v3, s7
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v5f64_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-NEXT: v_mov_b32_e32 v3, s11
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
; Stores an inreg <8 x i8> kernel argument to %out.
; The expected prologue loads the vector as two dwords at kernarg offset 0x8.
; The body rebuilds each dword from its low 16 bits and its two upper bytes
; with a shift/mask/or sequence, then writes both with one 64-bit store.
define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 {
; GFX942-LABEL: v8i8_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB15_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB15_0:
; GFX942-NEXT: s_lshr_b32 s1, s5, 24
; GFX942-NEXT: s_and_b32 s0, s5, 0xffff
; GFX942-NEXT: s_lshl_b32 s1, s1, 8
; GFX942-NEXT: s_bfe_u32 s5, s5, 0x80010
; GFX942-NEXT: s_or_b32 s1, s5, s1
; GFX942-NEXT: s_lshl_b32 s1, s1, 16
; GFX942-NEXT: s_lshr_b32 s5, s4, 24
; GFX942-NEXT: s_or_b32 s0, s0, s1
; GFX942-NEXT: s_and_b32 s1, s4, 0xffff
; GFX942-NEXT: s_lshl_b32 s5, s5, 8
; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX942-NEXT: s_or_b32 s4, s4, s5
; GFX942-NEXT: s_lshl_b32 s4, s4, 16
; GFX942-NEXT: s_or_b32 s1, s1, s4
; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v8i8_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
; GFX90a-NEXT: s_lshr_b32 s1, s9, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
; GFX90a-NEXT: s_lshr_b32 s2, s8, 24
; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010
; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s2, s3, s2
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-NEXT: s_or_b32 s1, s1, s2
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg i64 kernel argument to %out.
; The expected prologue loads the value as two dwords at kernarg offset 0x8;
; the body copies the SGPR pair to a VGPR pair (v_mov_b64 on gfx942,
; v_pk_mov_b32 on gfx90a) and issues one 64-bit store.
define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 {
; GFX942-LABEL: i64_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB16_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB16_0:
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i64_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
; Stores an inreg double kernel argument to %out.
; The expected prologue loads the value as two dwords at kernarg offset 0x8;
; the body copies the SGPR pair to a VGPR pair (v_mov_b64 on gfx942,
; v_pk_mov_b32 on gfx90a) and issues one 64-bit store.
define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 {
; GFX942-LABEL: f64_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB17_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB17_0:
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: f64_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg half kernel argument to %out.
; The expected body moves the dword loaded from kernarg offset 0x8 into a
; VGPR and writes it with a 16-bit store.
define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 {
; GFX942-LABEL: half_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB18_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB18_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: half_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg bfloat kernel argument to %out.
; The expected body moves the dword loaded from kernarg offset 0x8 into a
; VGPR and writes it with a 16-bit store.
define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 {
; GFX942-LABEL: bfloat_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB19_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB19_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg <2 x bfloat> kernel argument to %out.
; The expected body moves the dword loaded from kernarg offset 0x8 into a
; VGPR and writes both elements with one 32-bit store.
define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
; GFX942-LABEL: v2bfloat_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB20_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB20_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg <3 x bfloat> kernel argument to %out.
; The expected prologue loads the vector as two dwords at kernarg offset 0x8;
; the body writes the third element with a 16-bit store at offset 4 and the
; first two elements with one 32-bit store.
define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
; GFX942-LABEL: v3bfloat_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB21_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB21_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg <6 x bfloat> kernel argument to %out.
; The expected prologue loads four dwords at kernarg offset 0x8 and two more
; at 0x18; the body stores the three dwords that correspond to offsets
; 0x10, 0x14 and 0x18 with a single 96-bit store.
define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
; GFX942-LABEL: v6bfloat_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB22_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB22_0:
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
}
; Stores an inreg half to %out and an inreg <7 x bfloat> to a second inreg
; pointer, %out2.
; Both targets are expected to load eight dwords at kernarg offset 0x8 in the
; prologue. On gfx942 the body takes the %out2 address from the last two of
; those registers; on gfx90a the body instead reloads the address with an
; s_load_dwordx2 at offset 0x20 and waits on it before the first %out2 store.
; The vector is written as a 16-bit store at offset 12 plus a 96-bit store.
define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX942-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB23_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB23_0:
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b32_e32 v0, s4
; GFX942-NEXT: global_store_short v3, v0, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, s9
; GFX942-NEXT: global_store_short v3, v0, s[10:11] offset:12
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v0, s13
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
ret void
}
; Stores an i1 inreg kernel argument through the inreg pointer %out. The
; checks expect the dword loaded from kernarg offset 0x8 before the s_branch
; to be masked with "s_and_b32 ..., 1" and written with global_store_byte.
define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
; GFX942-LABEL: i1_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB24_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB24_0:
; GFX942-NEXT: s_and_b32 s0, s4, 1
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_byte v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
}
; Stores an fp128 inreg kernel argument through the inreg pointer %out. The
; checks expect the four SGPRs loaded from kernarg offsets 0x10 through 0x1c
; to be moved to VGPRs and written with one global_store_dwordx4, with no
; kernarg loads after the s_branch on either target.
define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
; GFX942-LABEL: fp128_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB25_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB25_0:
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v3, s9
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, s13
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
}
; Stores a <7 x i8> inreg kernel argument through the inreg pointer %out.
; The checks expect three stores: a global_store_dword at offset 0 of a value
; rebuilt from the first kernarg dword at offset 0x8 using s_lshr_b32,
; s_and_b32, s_bfe_u32, s_lshl_b32 and s_or_b32, followed in program order by
; nothing else; before it come a global_store_byte_d16_hi at offset 6 and a
; global_store_short at offset 4, both fed from the second kernarg dword.
; The gfx942 and gfx90a sequences use the same set of scalar instructions;
; they differ in where the s_and_b32 is scheduled and in the SGPR that
; receives the s_bfe_u32 result.
define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
; GFX942-LABEL: v7i8_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB26_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB26_0:
; GFX942-NEXT: s_lshr_b32 s1, s4, 24
; GFX942-NEXT: s_and_b32 s0, s4, 0xffff
; GFX942-NEXT: s_lshl_b32 s1, s1, 8
; GFX942-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX942-NEXT: s_or_b32 s1, s4, s1
; GFX942-NEXT: s_lshl_b32 s1, s1, 16
; GFX942-NEXT: s_or_b32 s0, s0, s1
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:6
; GFX942-NEXT: global_store_short v0, v1, s[2:3] offset:4
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
}
; Stores a <7 x half> inreg kernel argument through the inreg pointer %out.
; The checks expect the last two bytes of the vector to be written with a
; global_store_short at offset 12 and the first twelve bytes with a single
; global_store_dwordx3, all sourced from SGPRs loaded before the s_branch.
define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
; GFX942-LABEL: v7half_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB27_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB27_0:
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b32_e32 v0, s9
; GFX942-NEXT: global_store_short v3, v0, s[2:3] offset:12
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s13
; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
}
; Stores an i16 to %out and an i32 to %out2; all four arguments are inreg.
; The checks expect the i16, the i32 and %out2 to all come from the single
; s_load_dwordx4 at kernarg offset 0x8 (first SGPR, second SGPR and last two
; SGPRs respectively), with no kernarg loads after the s_branch on either
; target.
define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX942-LABEL: i16_i32_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB28_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB28_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v1, s5
; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
ret void
}
; Stores an i16 to %out and a <3 x i32> to %out2; all four arguments are
; inreg. Both targets are expected to take the i16 and the vector from the
; s_load_dwordx8 issued before the s_branch.
; They differ for %out2: the gfx942 checks use s[10:11] from that same load,
; while the gfx90a checks expect %out2 to be reloaded from kernarg offset 0x20
; after the branch target, with an s_waitcnt before the dwordx3 store.
define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX942-LABEL: i16_v3i32_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB29_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB29_0:
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: v_mov_b32_e32 v1, s7
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: global_store_short v3, v4, s[2:3]
; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v4, s8
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
ret void
}
; Stores two i16 inreg arguments, %in to %out and %in2 to %out2. The checks
; expect a single VGPR copy of one SGPR (the dword loaded from kernarg offset
; 0x8) to feed both stores: global_store_short for %in and
; global_store_short_d16_hi for %in2, i.e. both values are taken from the same
; register rather than from separate ones.
define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX942-LABEL: i16_i16_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB30_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB30_0:
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
ret void
}
; Stores an i16 to %out and a <2 x i8> to %out2; all four arguments are inreg.
; The checks expect both values to be derived from the same SGPR (the dword
; loaded from kernarg offset 0x8): it is copied unchanged for the i16 store,
; and the <2 x i8> value is reassembled from it with s_lshr_b32 by 24,
; s_lshl_b32 by 8, s_bfe_u32 and s_or_b32 before a second global_store_short.
; No kernarg loads are expected after the s_branch on either target.
define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX942-LABEL: i16_v2i8_kernel_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB31_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB31_0:
; GFX942-NEXT: s_lshr_b32 s0, s4, 24
; GFX942-NEXT: s_lshl_b32 s0, s0, 8
; GFX942-NEXT: s_bfe_u32 s1, s4, 0x80010
; GFX942-NEXT: s_or_b32 s0, s1, s0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s4
; GFX942-NEXT: global_store_short v0, v1, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_short v0, v1, s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
ret void
}
; %arg0 and %arg1 are marked inreg but the %out pointer between them is not.
; With the current behavior the second inreg argument, %arg1, is not expected
; to be preloaded: the checks place only the s_load_dword for %arg0 (kernarg
; offset 0x0) before the s_branch, while %out (offset 0x8) and %arg1 (offset
; 0x10) are both loaded after the branch target.
define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
; GFX942-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB32_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB32_0:
; GFX942-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_add_i32 s0, s2, s3
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[4:5]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB32_0:
; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_add_i32 s2, s6, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
; Same IR body as the ptr1_i8 test at the top of this file (zext the i8 and
; store it as an i32), but with an extra trailing inreg i32 argument that is
; never used. The checks expect the load at kernarg offset 0x8 before the
; s_branch to be an s_load_dwordx2 here (ptr1_i8 checks a single s_load_dword
; at that offset); the body then masks the first of the two SGPRs with 0xff
; before the dword store.
define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
; GFX942-LABEL: ptr1_i8_trailing_unused:
; GFX942: ; %bb.1:
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX942-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB33_0
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB33_0:
; GFX942-NEXT: s_and_b32 s0, s4, 0xff
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
; Attribute group referenced as #0 by the kernel definitions above.
; NOTE(review): the "amdgpu-no-*" entries presumably declare that the kernels
; do not need the corresponding implicit inputs, and "amdgpu-agpr-alloc"="0"
; presumably that no AGPRs are allocated - confirm against the AMDGPU usage
; documentation before relying on this description.
attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }