[NFC][AMDGPU] Update tests to use autogened CHECKs (#140311)

Chinmay Deshpande 2025-05-16 17:22:19 -07:00 committed by GitHub
parent 286ab11dc6
commit 437195efbf
9 changed files with 11027 additions and 1582 deletions
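
The CHECK lines below were generated mechanically rather than written by hand. As a rough sketch of the workflow (the build path and test name here are illustrative, not taken from this commit), regenerating a test looks like:

    llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
        llvm/test/CodeGen/AMDGPU/some-test.ll

The script executes each RUN line, captures the llc output, and rewrites the ;-prefixed CHECK blocks in place; the "UTC_ARGS: --version 5" marker in each file's NOTE line records the options used to generate them.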

@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
@@ -10,21 +11,70 @@
; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
; instructions with B64, U64, and I64 take 64-bit operands.
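; As an illustrative sketch (not part of this test): loading an i64 through the
; same LDS pointer would select the 64-bit form of the read instead, e.g.
;   %v = load i64, ptr addrspace(3) %in   ; -> ds_read_b64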
; FUNC-LABEL: {{^}}local_address_load:
; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @local_address_load(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_load:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = load i32, ptr addrspace(3) %in
store i32 %0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}local_address_gep:
; SI: s_add_i32 [[SPTR:s[0-9]]]
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
define amdgpu_kernel void @local_address_gep(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %offset) {
; GFX7-LABEL: local_address_gep:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b32 s3, s3, 2
; GFX7-NEXT: s_add_i32 s2, s2, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s3, s3, 2
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 %offset
%1 = load i32, ptr addrspace(3) %0
@@ -32,10 +82,34 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_const_offset:
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
define amdgpu_kernel void @local_address_gep_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_gep_const_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0 offset:4
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_const_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0 offset:4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 1
%1 = load i32, ptr addrspace(3) %0
@@ -44,11 +118,36 @@ entry:
}
; Offset too large, can't fold into 16-bit immediate offset.
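; (Arithmetic check: the DS immediate offset is an unsigned 16-bit byte offset,
; so the largest foldable value is 65535; here the byte offset is
; 16385 * 4 = 65540 = 0x10004, which is why it is added with s_add_i32 instead.)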
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
define amdgpu_kernel void @local_address_gep_large_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_gep_large_const_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s2, s2, 0x10004
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_large_const_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s2, s2, 0x10004
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 16385
%1 = load i32, ptr addrspace(3) %0
@@ -56,24 +155,70 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
; GFX7: v_cmp_ne_u32
; GFX7: s_cselect_b32
; GFX8: s_cmp_lg_u32
; GFX8-NOT: v_cmp_ne_u32
; GFX8: s_cselect_b32
define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
; GFX7-LABEL: null_32bit_lds_ptr:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_movk_i32 s4, 0x7b
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_cmp_lg_u32 s6, 0
; GFX7-NEXT: s_cselect_b32 s4, s4, 0x1c8
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: null_32bit_lds_ptr:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_movk_i32 s4, 0x7b
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_cselect_b32 s4, s4, 0x1c8
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%cmp = icmp ne ptr addrspace(3) %lds, null
%x = select i1 %cmp, i32 123, i32 456
store i32 %x, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}mul_32bit_ptr:
; SI: s_mul_i32
; SI-NEXT: s_add_i32
; SI: ds_read_b32
define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %tid) {
; GFX7-LABEL: mul_32bit_ptr:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mul_i32 s3, s3, 12
; GFX7-NEXT: s_add_i32 s2, s2, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: mul_32bit_ptr:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s3, s3, 12
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%ptr = getelementptr [3 x float], ptr addrspace(3) %lds, i32 %tid, i32 0
%val = load float, ptr addrspace(3) %ptr
store float %val, ptr addrspace(1) %out
@@ -82,60 +227,156 @@ define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3)
@g_lds = addrspace(3) global float poison, align 4
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(ptr addrspace(1) %out, i32 %tid) {
; GFX7-LABEL: infer_ptr_alignment_global_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: infer_ptr_alignment_global_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%val = load float, ptr addrspace(3) @g_lds
store float %val, ptr addrspace(1) %out
ret void
}
@ptr = addrspace(3) global ptr addrspace(3) poison
@dst = addrspace(3) global [16383 x i32] poison
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
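; (To unpack the constants below, assuming @dst is placed at LDS offset 0 with
; @ptr allocated right after it: @dst spans 16383 * 4 = 65532 bytes, so @ptr
; sits at offset 65532, and the stored pointer value @dst + 16*4 is the
; literal 64.)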
define amdgpu_kernel void @global_ptr() nounwind {
; SI-LABEL: global_ptr:
; SI: ; %bb.0:
; SI-NEXT: v_mov_b32_e32 v0, 64
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write_b32 v1, v0 offset:65532
; SI-NEXT: s_endpgm
store ptr addrspace(3) getelementptr ([16383 x i32], ptr addrspace(3) @dst, i32 0, i32 16), ptr addrspace(3) @ptr
ret void
}
; FUNC-LABEL: {{^}}local_address_store:
; SI: ds_write_b32
define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: ds_write_b32 v0, v1
; GFX8-NEXT: s_endpgm
store i32 %val, ptr addrspace(3) %out
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_store:
; SI: s_add_i32 [[SADDR:s[0-9]+]],
; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
; GFX7-LABEL: local_address_gep_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_i32 s0, s2, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s1, s1, 2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_i32 s0, s2, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: ds_write_b32 v1, v0
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 %offset
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
define amdgpu_kernel void @local_address_gep_const_offset_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_gep_const_offset_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v0, v1 offset:4
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_const_offset_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: ds_write_b32 v0, v1 offset:4
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 1
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
; Offset too large, can't fold into 16-bit immediate offset.
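; (As above: 16385 * 4 = 0x10004 exceeds the 16-bit maximum of 0xffff.)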
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
define amdgpu_kernel void @local_address_gep_large_const_offset_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_gep_large_const_offset_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s0, 0x10004
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_large_const_offset_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s0, s0, 0x10004
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: ds_write_b32 v1, v0
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 16385
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FUNC: {{.*}}

@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
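; These tests cover the combine that folds a bitwise op with an all-zeros or
; all-ones select, e.g. (and %y, (select %c, 0, -1)) -> (select %c, 0, %y), so
; a scalar s_cselect_b32 is emitted and no v_and_b32/v_or_b32 remains.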
; GCN-LABEL: {{^}}select_and1:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = and i32 %y, %s
@@ -13,12 +19,17 @@ define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and2:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = and i32 %s, %y
@@ -26,12 +37,17 @@ define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and3:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and3:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
%a = and i32 %y, %s
@@ -39,18 +55,25 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and_v4:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN-NOT: v_and_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
; GCN-LABEL: select_and_v4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s8, 10
; GCN-NEXT: s_cselect_b32 s3, s3, 0
; GCN-NEXT: s_cselect_b32 s2, s2, 0
; GCN-NEXT: s_cselect_b32 s1, s1, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %s, %y
@@ -58,12 +81,17 @@ define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32>
ret void
}
; GCN-LABEL: {{^}}select_or1:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = or i32 %y, %s
@@ -71,12 +99,17 @@ define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or2:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = or i32 %s, %y
@@ -84,12 +117,17 @@ define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or3:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or3:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
%a = or i32 %y, %s
@@ -97,18 +135,25 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or_v4:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1
; GCN-NOT: v_or_b32
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
; GCN-LABEL: select_or_v4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s8, 11
; GCN-NEXT: s_cselect_b32 s3, s3, -1
; GCN-NEXT: s_cselect_b32 s2, s2, -1
; GCN-NEXT: s_cselect_b32 s1, s1, -1
; GCN-NEXT: s_cselect_b32 s0, s0, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = or <4 x i32> %s, %y
@@ -116,192 +161,360 @@ define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 9, 2
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 -4, i32 3
%bo = sub i32 5, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 9, 2
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i16 -4, i16 3
%bo = sub i16 5, %sel
store i16 %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg:
; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16_neg:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 0xfffd
; GCN-NEXT: s_cselect_b32 s2, s2, 0xf449
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i16 4, i16 3000
%bo = sub i16 1, %sel
store i16 %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v2i16:
; GCN-DAG: s_mov_b32 [[T:s[0-9]+]], 0x50009
; GCN: s_cselect_b32 s{{[0-9]+}}, [[T]], 0x60002
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 0x50009
; GCN-NEXT: s_cselect_b32 s2, s2, 0x60002
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <2 x i16> <i16 -4, i16 2>, <2 x i16> <i16 3, i16 1>
%bo = sub <2 x i16> <i16 5, i16 7>, %sel
store <2 x i16> %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 7, 14
; GCN-NEXT: s_cselect_b32 s3, 6, 10
; GCN-NEXT: s_cselect_b32 s4, 5, 6
; GCN-NEXT: s_cselect_b32 s5, 9, 2
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
%bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
store <4 x i32> %bo, ptr addrspace(1) %p, align 32
ret void
}
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
define amdgpu_kernel void @sdiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sdiv_constant_sel_constants_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0, 5
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 121, i64 23
%bo = sdiv i64 120, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i32:
; GCN: s_cselect_b32 s{{[0-9]+}}, 26, 8
define amdgpu_kernel void @sdiv_constant_sel_constants_i32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sdiv_constant_sel_constants_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 26, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 7, i32 23
%bo = sdiv i32 184, %sel
store i32 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
define amdgpu_kernel void @udiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: udiv_constant_sel_constants_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0, 5
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 -4, i64 23
%bo = udiv i64 120, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}srem_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
define amdgpu_kernel void @srem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: srem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 33, 3
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 34, i64 15
%bo = srem i64 33, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}urem_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
define amdgpu_kernel void @urem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: urem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 33, 3
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 34, i64 15
%bo = urem i64 33, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}shl_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 4, 8
define amdgpu_kernel void @shl_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: shl_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 4, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = shl i32 1, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}lshr_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 16, 8
define amdgpu_kernel void @lshr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: lshr_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 16, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = lshr i32 64, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}ashr_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 32, 16
define amdgpu_kernel void @ashr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: ashr_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 32, 16
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = ashr i32 128, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, 1.0,
define amdgpu_kernel void @fsub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, -4.0, 1.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -2.0, float 3.0
%bo = fsub float -1.0, %sel
store float %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_f16:
; TODO: it should be possible to fold these constants with OpSel
; GCN-DAG: v_mov_b32_e32 [[T:v[0-9]+]], 0x3c00
; GCN-DAG: v_mov_b32_e32 [[F:v[0-9]+]], 0xc400
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, [[F]], [[T]],
define amdgpu_kernel void @fsub_constant_sel_constants_f16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0xc400
; GCN-NEXT: v_mov_b32_e32 v2, 0x3c00
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, half -2.0, half 3.0
%bo = fsub half -1.0, %sel
store half %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v2f16:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0x45003c00, -2.0
define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0x45003c00, -2.0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <2 x half> <half -2.0, half -3.0>, <2 x half> <half -1.0, half 4.0>
%bo = fsub <2 x half> <half -1.0, half 2.0>, %sel
store <2 x half> %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32:
; GCN: s_mov_b32 [[T0:s[0-9]+]], 0x41500000
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s3, 0x41500000
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000
; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
; GCN-NEXT: s_cselect_b32 s5, 1.0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
%bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
store <4 x float> %bo, ptr addrspace(1) %p, align 32
ret void
}
; GCN-LABEL: {{^}}fdiv_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -2.0,
define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fdiv_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, 4.0, -2.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -4.0, float 2.0
%bo = fdiv float 8.0, %sel
store float %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}frem_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0,
define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: frem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -4.0, float 3.0
%bo = frem float 5.0, %sel
store float %bo, ptr addrspace(1) %p, align 4

@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
@@ -6,30 +7,40 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
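; llvm.amdgcn.class.f32(%x, %mask) tests %x against a bitmask of floating-point
; classes and is selected to v_cmp_class_f32 (the f64 variant likewise to
; v_cmp_class_f64); the tests below check how source modifiers and mask
; operands are folded into the compare.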
; SI-LABEL: {{^}}test_class_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e32 vcc, s7, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], |s7|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -37,15 +48,20 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -s7, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fneg = fsub float -0.0, %a
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -53,15 +69,20 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -|s7|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -70,26 +91,36 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x
ret void
}
; SI-LABEL: {{^}}test_class_1_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_1_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_64_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_64_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 64
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -97,42 +128,62 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0
}
; Set all 10 bits of mask
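; (1023 = 0x3ff: one bit for each of the ten classes the instruction can test,
; i.e. signaling NaN, quiet NaN, +/-infinity, +/-normal, +/-subnormal, +/-zero.)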
; SI-LABEL: {{^}}test_class_full_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_full_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_9bit_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_9bit_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}v_test_class_full_mask_f32:
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_test_class_full_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s4, 0x1ff
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v2, s4
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -144,13 +195,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, 1.0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -163,14 +224,24 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a
}
; FIXME: Why isn't this using a literal constant operand?
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_lit_constant_dynamic_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s4, 0x44800000
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -182,30 +253,40 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac
ret void
}
; SI-LABEL: {{^}}test_class_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -213,15 +294,20 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -s[6:7], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fneg = fsub double -0.0, %a
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -229,15 +315,20 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -|s[6:7]|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -246,20 +337,38 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x
ret void
}
; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_1_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_64_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 64
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -267,30 +376,45 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #
}
; Set all 9 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 {
; SI-LABEL: test_class_full_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
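; For reference, a sketch of the v_cmp_class mask encoding these tests rely on
; (per the GCN ISA's bit assignment; shown here as an illustrative aid):
;   bit 0 sNaN, bit 1 qNaN, bit 2 -inf, bit 3 -normal, bit 4 -denormal,
;   bit 5 -0.0, bit 6 +0.0, bit 7 +denormal, bit 8 +normal, bit 9 +inf
; 0x1ff (511) thus covers bits 0-8 and is too wide for an inline immediate,
; which is why the literal is materialized with v_mov_b32 above. A
; hypothetical mask combining two of the bits:
define i1 @class_mask_sketch(float %x) {
  %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 520) ; (1<<9)|(1<<3)
  ret i1 %r
}
declare i1 @llvm.amdgcn.class.f32(float, i32)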
; SI-LABEL: {{^}}v_test_class_full_mask_f64:
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_test_class_full_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_movk_i32 s4, 0x1ff
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], s4
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -302,11 +426,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, 1.0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -318,10 +454,25 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a
ret void
}
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_lit_constant_dynamic_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s5, 0x40900000
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -333,12 +484,26 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -353,12 +518,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a
ret void
}
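; A minimal sketch (hypothetical function, masks assumed) of the fold being
; checked here: two class tests of the same value combine by OR-ing their
; masks, leaving the single v_cmp_class with mask 3 seen above.
define i1 @fold_or_class_sketch(float %x) {
  %c0 = call i1 @llvm.amdgcn.class.f32(float %x, i32 1)
  %c1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 2)
  %or = or i1 %c0, %c1 ; folds to class(%x, 1|2)
  ret i1 %or
}
declare i1 @llvm.amdgcn.class.f32(float, i32)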
; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or3_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -375,13 +554,27 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
; SI-NOT: v_cmp_class
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_all_tests_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_movk_i32 s2, 0x3ff
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -411,12 +604,26 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_1:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 12
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -431,12 +638,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_2:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -451,12 +672,29 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr a
ret void
}
; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 {
; SI-LABEL: test_no_fold_or_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s12, 8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 4
; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -471,72 +709,94 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt
ret void
}
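; Conversely, a sketch (hypothetical function) of why no fold happens above:
; the two class tests classify different values, so their masks cannot merge
; and the results are combined with s_or_b64 instead.
define i1 @no_fold_or_class_sketch(float %a, float %b) {
  %c0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4)
  %c1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8)
  %or = or i1 %c0, %c1
  ret i1 %or
}
declare i1 @llvm.amdgcn.class.f32(float, i32)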
; SI-LABEL: {{^}}test_class_0_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_0_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_0_f64:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_0_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_undef_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
; SI-LABEL: test_class_undef_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
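; The tests above check folding to a constant: an empty class mask can match
; nothing, and a class test of a poison value folds away entirely, so only the
; v_mov_b32 of 0 and the store remain. Hypothetical sketch:
define i1 @class_constant_fold_sketch(float %x) {
  %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 0) ; mask 0: always false
  ret i1 %r
}
declare i1 @llvm.amdgcn.class.f32(float, i32)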
; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
; SI-LABEL: test_fold_and_ord:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 32
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
%ord = fcmp ord float %a, %a
%and = and i1 %ord, %class
ret i1 %and
}
; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
; SI-LABEL: test_fold_and_unord:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
%ord = fcmp uno float %a, %a
%and = and i1 %ord, %class
ret i1 %and
}
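; Sketch of the mask arithmetic behind the two folds above (hypothetical
; function): fcmp ord is "neither operand is NaN" and fcmp uno is "either is
; NaN", so AND-ing with a class test just masks the NaN bits (0 and 1) out or
; in: 35 & ~3 == 32 for the ord case and 35 & 3 == 3 for the uno case,
; matching the immediates in the checks.
define i1 @fold_and_uno_sketch(float %x) {
  %class = call i1 @llvm.amdgcn.class.f32(float %x, i32 35)
  %isnan = fcmp uno float %x, %x
  %and = and i1 %isnan, %class ; folds to class(%x, 3)
  ret i1 %and
}
declare i1 @llvm.amdgcn.class.f32(float, i32)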
; SI-LABEL: {{^}}test_fold_and_ord_multi_use:
; SI: v_cmp_class
; SI-NOT: v_cmp_class
; SI: v_cmp_o
; SI: s_and_b64
define i1 @test_fold_and_ord_multi_use(float %a) {
; SI-LABEL: test_fold_and_ord_multi_use:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
store volatile i1 %class, ptr addrspace(1) poison
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}
File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
@@ -5,162 +6,766 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}global_load_f32:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
; GCN-HSA: flat_load_dword
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load float, ptr addrspace(1) %in
store float %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v2f32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_64
define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v2f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v2f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v2f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <2 x float>, ptr addrspace(1) %in
store <2 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v3f32:
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCNX3-HSA: flat_load_dwordx3
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v3f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v3f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v3f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <3 x float>, ptr addrspace(1) %in
store <3 x float> %tmp0, ptr addrspace(1) %out
ret void
}
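; Note the SI-NOHSA lowering above: gfx6 lacks dwordx3 buffer operations, so
; the <3 x float> load overfetches with a dwordx4 and the store is split into
; dword + dwordx2, while the dwordx3-capable targets use single x3 operations.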
; FUNC-LABEL: {{^}}global_load_v4f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v4f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v4f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v4f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <4 x float>, ptr addrspace(1) %in
store <4 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v8f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v8f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v8f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v8f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <8 x float>, ptr addrspace(1) %in
store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v9f32:
; GCN-NOHSA: buffer_load_dword
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dword
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_32
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v9f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v9f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dword v14, v[6:7]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dword v[10:11], v14
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v9f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <9 x float>, ptr addrspace(1) %in
store <9 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v10f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v10f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v10f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v10f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <10 x float>, ptr addrspace(1) %in
store <10 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v11f32:
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx3
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v11f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:40
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v11f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v11f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <11 x float>, ptr addrspace(1) %in
store <11 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v12f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v12f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v12f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v12f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <12 x float>, ptr addrspace(1) %in
store <12 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v16f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v16f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v16f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v16f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <16 x float>, ptr addrspace(1) %in
store <16 x float> %tmp0, ptr addrspace(1) %out
@@ -168,3 +773,8 @@ entry:
}
attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FUNC: {{.*}}
; GCN-NOHSA: {{.*}}
; GCNX3-HSA: {{.*}}
; R600: {{.*}}


@@ -1,15 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: Fails with -enable-var-scope
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
; Extract the high bit of the low half
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -21,13 +35,24 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr
}
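; A sketch of the source pattern (hypothetical function; the real tests derive
; the address from a workitem/workgroup id): a single-bit extract from an i64,
; (x >> k) & 1, which the checks show lowering to a 32-bit shift or BFE on
; whichever dword holds bit k, with the other half of the result tied to zero.
define i64 @uextract_bit_sketch(i64 %x) {
  %shr = lshr i64 %x, 31
  %bit = and i64 %shr, 1 ; bit 31 lives in the low dword
  ret i64 %bit
}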
; Extract the high bit of the high half
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -38,12 +63,25 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -54,12 +92,25 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_20_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -70,13 +121,24 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -87,13 +149,24 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -104,12 +177,25 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_20_21_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -120,12 +206,25 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_30_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -136,12 +235,25 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_31_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -154,13 +266,26 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad
; Spans the dword boundary, so requires a full 64-bit shift.
; Truncated after the shift, so only the low half of the shift result is used.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 31
; GCN-NEXT: v_and_b32_e32 v2, 3, v2
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -171,13 +296,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a
ret void
}
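; Illustrative IR for the dword-spanning shape above (a sketch with a made-up
; name, not a checked test): bits 31 and 32 straddle the two 32-bit halves, so
; both dwords are loaded and combined with a funnel shift (v_alignbit_b32 in
; the checks) before the 2-bit mask is applied; the high half of the result is
; a constant zero.
;   define i64 @uextract_bit_31_32_sketch(i64 %x) {
;     %shr = lshr i64 %x, 31   ; shift crosses the dword boundary
;     %bits = and i64 %shr, 3  ; keep bits 31 and 32
;     ret i64 %bits
;   }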
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_32_33_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -188,14 +324,24 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_30_60_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 30
; GCN-NEXT: v_and_b32_e32 v2, 0x3fffffff, v2
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -206,13 +352,24 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -223,12 +380,25 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 31
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -240,11 +410,23 @@ define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr a
}
; trunc applied before the 'and' mask
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v3
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -256,11 +438,23 @@ define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out
ret void
}
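; A sketch of the trunc-before-mask shape tested above (illustrative only,
; name made up): because only an i32 result is kept, just the low half of the
; 64-bit shift is needed, which folds to a single 32-bit shift of the loaded
; low dword; the mask is then redundant, so no v_and appears in the checks.
;   define i32 @uextract_bit_31_trunc_sketch(i64 %x) {
;     %shr = lshr i64 %x, 31
;     %t = trunc i64 %shr to i32   ; trunc applied before the mask
;     %bit = and i32 %t, 1         ; folds away after the 31-bit shift
;     ret i32 %bit
;   }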
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_3_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v3, 3, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -272,11 +466,24 @@ define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out,
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v3, 1, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -288,13 +495,24 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_32_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 31
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -306,16 +524,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %
ret void
}
; GCN-LABEL: {{^}}and_not_mask_i64:
; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: and_not_mask_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_and_b32_e32 v2, 4, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -328,15 +554,29 @@ define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace
; The instruction count is the same with/without hasOneUse, but
; keeping the 32-bit 'and' has a smaller encoding size than the bfe.
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_lshr_b64 v[[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]], [[VAL]], 27
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_27_29_multi_use_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 27
; GCN-NEXT: v_and_b32_e32 v0, 3, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -348,15 +588,30 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspac
ret void
}
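; Illustrative IR for the multi-use case above (a sketch, not a checked test;
; names are made up): the second use forces the full 64-bit shift to be
; materialized anyway (v_lshr_b64 in the checks), so the extract reuses its
; low half with a 32-bit 'and' instead of emitting a separate bfe.
;   define void @uextract_multi_use_sketch(i64 %x, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
;     %shr = lshr i64 %x, 27
;     store volatile i64 %shr, ptr addrspace(1) %out0  ; second use of the shift
;     %bits = and i64 %shr, 3
;     store volatile i64 %bits, ptr addrspace(1) %out1
;     ret void
;   }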
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v[[[SHR]]:[[ZERO_SHR]]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO_BFE]]]
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_34_37_multi_use_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v2
; GCN-NEXT: v_bfe_u32 v2, v2, 2, 3
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -368,13 +623,32 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspac
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v5, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: v_mov_b32_e32 v6, v2
; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[4:5]
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v5, v5, 1, 3
; GCN-NEXT: buffer_store_dwordx2 v[5:6], v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[3:4], s[8:11], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out0.gep = getelementptr i64, ptr addrspace(1) %out0, i32 %id.x


@ -1,51 +1,150 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) {
; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[4:5],
; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
; SI: buffer_store_dword [[VLOAD]]
; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]]
; EG-LABEL: {{^}}trunc_i64_to_i32_store:
; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG: LSHR
; EG-NEXT: 2(
; SI-LABEL: trunc_i64_to_i32_store:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i64_to_i32_store:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i64_to_i32_store:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.X, KC0[4].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = trunc i64 %in to i32
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}trunc_load_shl_i64:
; GCN-DAG: s_load_dwordx2
; GCN-DAG: s_load_dword [[SREG:s[0-9]+]],
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
; GCN: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
; SI: buffer_store_dword [[VSHL]]
; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
define amdgpu_kernel void @trunc_load_shl_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
; SI-LABEL: trunc_load_shl_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s6, 2
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_load_shl_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_load_shl_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: LSHL * T1.X, KC0[4].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b = shl i64 %a, 2
%result = trunc i64 %b to i32
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}trunc_shl_i64:
; SI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN: s_lshl_b64 s[[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s[[[LO_SREG]]:{{[0-9]+\]}}, 2
; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
; SI: buffer_store_dword v[[LO_VREG]],
; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
; GCN: v_mov_b32_e32
; GCN: v_mov_b32_e32
define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1) %out, i64 %a) {
; SI-LABEL: trunc_shl_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
; SI-NEXT: s_add_u32 s8, s8, 0x3a8
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_addc_u32 s9, s9, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_shl_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 2
; VI-NEXT: s_add_u32 s0, s0, 0x3a8
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_shl_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[2].W, literal.x,
; EG-NEXT: ADDC_UINT * T2.W, PV.W, literal.y,
; EG-NEXT: 30(4.203895e-44), 936(1.311615e-42)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT T1.Y, PV.W, PS,
; EG-NEXT: ADD_INT * T1.X, T0.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 936(1.311615e-42)
; EG-NEXT: LSHR * T2.X, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%aa = add i64 %a, 234 ; Prevent shrinking store.
%b = shl i64 %aa, 2
%result = trunc i64 %b to i32
@ -54,9 +153,55 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1
ret void
}
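; Per the "Prevent shrinking store" note above, the add keeps the full i64
; value live; the trunc-of-shl itself still folds to a 32-bit shift, as in
; this illustrative sketch (not a checked test; the name is made up):
;   define void @trunc_shl_sketch(ptr addrspace(1) %out, i64 %a) {
;     %b = shl i64 %a, 2
;     %r = trunc i64 %b to i32   ; equivalent to shl i32 (trunc %a), 2
;     store i32 %r, ptr addrspace(1) %out
;     ret void
;   }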
; GCN-LABEL: {{^}}trunc_i32_to_i1:
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
; SI-LABEL: trunc_i32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.X, T0.X, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i32, ptr addrspace(1) %ptr, align 4
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
@ -64,9 +209,64 @@ define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(
ret void
}
; GCN-LABEL: {{^}}trunc_i8_to_i1:
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
; SI-LABEL: trunc_i8_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i8_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i8_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i8, ptr addrspace(1) %ptr, align 4
%trunc = trunc i8 %a to i1
%result = select i1 %trunc, i8 1, i8 0
@ -74,43 +274,213 @@ define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @sgpr_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %a) {
; SI-LABEL: sgpr_trunc_i16_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: sgpr_trunc_i16_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: sgpr_trunc_i16_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%trunc = trunc i16 %a to i1
%result = select i1 %trunc, i16 1, i16 0
store i16 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
; SI-LABEL: sgpr_trunc_i32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: sgpr_trunc_i32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: sgpr_trunc_i32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.X, KC0[2].Z, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: s_bitcmp1_b32 s[[SLO]], 0
; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
; SI-LABEL: s_trunc_i64_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s6, 0
; SI-NEXT: s_cselect_b32 s4, 63, -12
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_trunc_i64_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s0, 0
; VI-NEXT: s_cselect_b32 s0, 63, -12
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_trunc_i64_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[4].W, 1,
; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -12(nan), 2(2.802597e-45)
%trunc = trunc i64 %x to i1
%sel = select i1 %trunc, i32 63, i32 -12
store i32 %sel, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_trunc_i64_to_i1:
; SI: buffer_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
; VI: flat_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_trunc_i64_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_trunc_i64_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 1, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_trunc_i64_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
; EG-NEXT: AND_INT T0.W, T1.X, 1, BS:VEC_120/SCL_212
; EG-NEXT: MOV * T1.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 63(8.828180e-44)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, PS,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT: -12(nan), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@ -121,3 +491,5 @@ define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspac
store i32 %sel, ptr addrspace(1) %out.gep
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}