[NFC][AMDGPU] Update tests to use autogened CHECKs (#140311)
This commit is contained in:
parent
286ab11dc6
commit
437195efbf
@ -1,3 +1,4 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
|
||||
|
||||
@ -10,21 +11,70 @@
|
||||
; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
|
||||
; instructions with B64, U64, and I64 take 64-bit operands.
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_load:
|
||||
; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
|
||||
; Loads one i32 through the addrspace(3) pointer %in and stores it to the
; addrspace(1) pointer %out. The assertions below expect the pointer to be
; copied from an SGPR into a VGPR and read with ds_read_b32 (no immediate
; offset), followed by a buffer_store_dword of the loaded value.
define amdgpu_kernel void @local_address_load(ptr addrspace(1) %out, ptr addrspace(3) %in) {
|
||||
; GFX7-LABEL: local_address_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_load:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = load i32, ptr addrspace(3) %in
|
||||
store i32 %0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_gep:
|
||||
; SI: s_add_i32 [[SPTR:s[0-9]]]
|
||||
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; SI: ds_read_b32 [[VPTR]]
|
||||
define amdgpu_kernel void @local_address_gep(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %offset) {
|
||||
; GFX7-LABEL: local_address_gep:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_lshl_b32 s3, s3, 2
|
||||
; GFX7-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 2
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = getelementptr i32, ptr addrspace(3) %in, i32 %offset
|
||||
%1 = load i32, ptr addrspace(3) %0
|
||||
@ -32,10 +82,34 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_gep_const_offset:
|
||||
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
|
||||
define amdgpu_kernel void @local_address_gep_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
|
||||
; GFX7-LABEL: local_address_gep_const_offset:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep_const_offset:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0 offset:4
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = getelementptr i32, ptr addrspace(3) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(3) %0
|
||||
@ -44,11 +118,36 @@ entry:
|
||||
}
|
||||
|
||||
; Offset too large, can't fold into 16-bit immediate offset.
|
||||
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
|
||||
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
|
||||
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; SI: ds_read_b32 [[VPTR]]
|
||||
define amdgpu_kernel void @local_address_gep_large_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
|
||||
; GFX7-LABEL: local_address_gep_large_const_offset:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_add_i32 s2, s2, 0x10004
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep_large_const_offset:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, 0x10004
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = getelementptr i32, ptr addrspace(3) %in, i32 16385
|
||||
%1 = load i32, ptr addrspace(3) %0
|
||||
@ -56,24 +155,70 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
|
||||
; GFX7 v_cmp_ne_u32
|
||||
; GFX7: s_cselect_b32
|
||||
; GFX8: s_cmp_lg_u32
|
||||
; GFX8-NOT: v_cmp_ne_u32
|
||||
; GFX8: s_cselect_b32
|
||||
; Compares the addrspace(3) pointer %lds against null and stores 123 to %out
; when it is non-null, 456 otherwise. The assertions below expect a scalar
; compare of the pointer register with 0 (s_cmp_lg_u32) feeding
; s_cselect_b32 between 0x7b (123) and 0x1c8 (456) on both GFX7 and GFX8.
define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
|
||||
; GFX7-LABEL: null_32bit_lds_ptr:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX7-NEXT: s_cselect_b32 s4, s4, 0x1c8
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: null_32bit_lds_ptr:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s4, s4, 0x1c8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%cmp = icmp ne ptr addrspace(3) %lds, null
|
||||
%x = select i1 %cmp, i32 123, i32 456
|
||||
store i32 %x, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}mul_32bit_ptr:
|
||||
; SI: s_mul_i32
|
||||
; SI-NEXT: s_add_i32
|
||||
; SI: ds_read_b32
|
||||
define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %tid) {
|
||||
; GFX7-LABEL: mul_32bit_ptr:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mul_i32 s3, s3, 12
|
||||
; GFX7-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: mul_32bit_ptr:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_mul_i32 s3, s3, 12
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%ptr = getelementptr [3 x float], ptr addrspace(3) %lds, i32 %tid, i32 0
|
||||
%val = load float, ptr addrspace(3) %ptr
|
||||
store float %val, ptr addrspace(1) %out
|
||||
@ -82,60 +227,156 @@ define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
|
||||
@g_lds = addrspace(3) global float poison, align 4
|
||||
|
||||
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
|
||||
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
|
||||
; Loads a float directly from the addrspace(3) global @g_lds and stores it to
; %out. The %tid argument is not used by the body. The assertions below
; expect the address to be materialized as the constant 0 in a VGPR and read
; with ds_read_b32.
define amdgpu_kernel void @infer_ptr_alignment_global_offset(ptr addrspace(1) %out, i32 %tid) {
|
||||
; GFX7-LABEL: infer_ptr_alignment_global_offset:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: infer_ptr_alignment_global_offset:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: ds_read_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX8-NEXT: s_mov_b32 s2, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%val = load float, ptr addrspace(3) @g_lds
|
||||
store float %val, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ptr = addrspace(3) global ptr addrspace(3) poison
|
||||
@dst = addrspace(3) global [16383 x i32] poison
|
||||
|
||||
; FUNC-LABEL: {{^}}global_ptr:
|
||||
; SI: ds_write_b32
|
||||
; Stores the address of element 16 of the addrspace(3) array @dst into the
; addrspace(3) global @ptr. The assertions below expect the stored value to
; be the constant 64 (16 elements * 4 bytes) written by ds_write_b32 from a
; zero base with offset 65532.
; NOTE(review): 65532 equals 16383 * 4, so @ptr is presumably laid out right
; after @dst with @dst at address 0 - confirm against the LDS allocator.
define amdgpu_kernel void @global_ptr() nounwind {
|
||||
; SI-LABEL: global_ptr:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 64
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 m0, -1
|
||||
; SI-NEXT: ds_write_b32 v1, v0 offset:65532
|
||||
; SI-NEXT: s_endpgm
|
||||
store ptr addrspace(3) getelementptr ([16383 x i32], ptr addrspace(3) @dst, i32 0, i32 16), ptr addrspace(3) @ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_store:
|
||||
; SI: ds_write_b32
|
||||
; Stores the i32 argument %val through the addrspace(3) pointer %out. The
; assertions below expect both kernel arguments to be fetched with a single
; s_load_dwordx2, copied into VGPRs, and written with ds_write_b32 (address
; first, data second, no immediate offset).
define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val) {
|
||||
; GFX7-LABEL: local_address_store:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_store:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: ds_write_b32 v0, v1
|
||||
; GFX8-NEXT: s_endpgm
|
||||
store i32 %val, ptr addrspace(3) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_gep_store:
|
||||
; SI: s_add_i32 [[SADDR:s[0-9]+]],
|
||||
; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
|
||||
; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
|
||||
; Stores %val to the i32 element at index %offset from the addrspace(3)
; pointer %out. Because the index is a runtime value, the assertions below
; expect the address to be computed in scalar registers (shift left by 2,
; then s_add_i32) and the ds_write_b32 to carry no immediate offset.
; The unnamed i32 second parameter is not referenced by the body.
; NOTE(review): it presumably only shifts the kernel-argument layout of %val
; and %offset - confirm the intent.
define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
|
||||
; GFX7-LABEL: local_address_gep_store:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_add_i32 s0, s2, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v1, v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep_store:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_add_i32 s0, s2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: ds_write_b32 v1, v0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%gep = getelementptr i32, ptr addrspace(3) %out, i32 %offset
|
||||
store i32 %val, ptr addrspace(3) %gep, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
|
||||
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
|
||||
; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
|
||||
; Stores %val to the i32 element at constant index 1 from the addrspace(3)
; pointer %out. The assertions below expect the 4-byte displacement to be
; folded into the ds_write_b32 instruction as "offset:4" rather than added
; to the base address beforehand.
define amdgpu_kernel void @local_address_gep_const_offset_store(ptr addrspace(3) %out, i32 %val) {
|
||||
; GFX7-LABEL: local_address_gep_const_offset_store:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1 offset:4
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep_const_offset_store:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: ds_write_b32 v0, v1 offset:4
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%gep = getelementptr i32, ptr addrspace(3) %out, i32 1
|
||||
store i32 %val, ptr addrspace(3) %gep, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Offset too large, can't fold into 16-bit immediate offset.
|
||||
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
|
||||
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
|
||||
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
|
||||
; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
|
||||
; Stores %val to the i32 element at constant index 16385 from the
; addrspace(3) pointer %out. The byte displacement is 16385 * 4 = 0x10004,
; which does not fit in 16 bits, so the assertions below expect an explicit
; s_add_i32 of 0x10004 to the base and a ds_write_b32 with no immediate
; offset.
define amdgpu_kernel void @local_address_gep_large_const_offset_store(ptr addrspace(3) %out, i32 %val) {
|
||||
; GFX7-LABEL: local_address_gep_large_const_offset_store:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_add_i32 s0, s0, 0x10004
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v1, v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: local_address_gep_large_const_offset_store:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, 0x10004
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: ds_write_b32 v1, v0
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%gep = getelementptr i32, ptr addrspace(3) %out, i32 16385
|
||||
store i32 %val, ptr addrspace(3) %gep, align 4
|
||||
ret void
|
||||
}
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; FUNC: {{.*}}
|
||||
|
@ -1,11 +1,17 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}select_and1:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_and1:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_gt_i32 s2, 10
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 0, i32 -1
|
||||
%a = and i32 %y, %s
|
||||
@ -13,12 +19,17 @@ define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_and2:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_and2:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_gt_i32 s2, 10
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 0, i32 -1
|
||||
%a = and i32 %s, %y
|
||||
@ -26,12 +37,17 @@ define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_and3:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_and3:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lt_i32 s2, 11
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 -1, i32 0
|
||||
%a = and i32 %y, %s
|
||||
@ -39,18 +55,25 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_and_v4:
|
||||
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0
|
||||
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0
|
||||
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0
|
||||
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0
|
||||
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
|
||||
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
|
||||
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
|
||||
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
|
||||
define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
|
||||
; GCN-LABEL: select_and_v4:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_gt_i32 s8, 10
|
||||
; GCN-NEXT: s_cselect_b32 s3, s3, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s1, s1, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%a = and <4 x i32> %s, %y
|
||||
@ -58,12 +81,17 @@ define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32>
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_or1:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_or_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_or1:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lt_i32 s2, 11
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 0, i32 -1
|
||||
%a = or i32 %y, %s
|
||||
@ -71,12 +99,17 @@ define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_or2:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_or_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_or2:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lt_i32 s2, 11
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 0, i32 -1
|
||||
%a = or i32 %s, %y
|
||||
@ -84,12 +117,17 @@ define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_or3:
|
||||
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
|
||||
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
|
||||
; GCN-NOT: v_or_b32
|
||||
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
; GCN-LABEL: select_or3:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_gt_i32 s2, 10
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, i32 -1, i32 0
|
||||
%a = or i32 %y, %s
|
||||
@ -97,18 +135,25 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}select_or_v4:
|
||||
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1
|
||||
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1
|
||||
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1
|
||||
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1
|
||||
; GCN-NOT: v_or_b32
|
||||
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
|
||||
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
|
||||
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
|
||||
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
|
||||
define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
|
||||
; GCN-LABEL: select_or_v4:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lt_i32 s8, 11
|
||||
; GCN-NEXT: s_cselect_b32 s3, s3, -1
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, -1
|
||||
; GCN-NEXT: s_cselect_b32 s1, s1, -1
|
||||
; GCN-NEXT: s_cselect_b32 s0, s0, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%c = icmp slt i32 %x, 11
|
||||
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%a = or <4 x i32> %s, %y
|
||||
@ -116,192 +161,360 @@ define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
|
||||
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sel_constants_sub_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 9, 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i32 -4, i32 3
|
||||
%bo = sub i32 5, %sel
|
||||
store i32 %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
|
||||
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 9, 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i16 -4, i16 3
|
||||
%bo = sub i16 5, %sel
|
||||
store i16 %bo, ptr addrspace(1) %p, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg:
|
||||
; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449
|
||||
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16_neg:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_mov_b32 s2, 0xfffd
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, 0xf449
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i16 4, i16 3000
|
||||
%bo = sub i16 1, %sel
|
||||
store i16 %bo, ptr addrspace(1) %p, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v2i16:
|
||||
; GCN-DAG: s_mov_b32 [[T:s[0-9]+]], 0x50009
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, [[T]], 0x60002
|
||||
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v2i16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_mov_b32 s2, 0x50009
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, 0x60002
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, <2 x i16> <i16 -4, i16 2>, <2 x i16> <i16 3, i16 1>
|
||||
%bo = sub <2 x i16> <i16 5, i16 7>, %sel
|
||||
store <2 x i16> %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32:
|
||||
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14
|
||||
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10
|
||||
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6
|
||||
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2
|
||||
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
|
||||
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
|
||||
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
|
||||
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
|
||||
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 7, 14
|
||||
; GCN-NEXT: s_cselect_b32 s3, 6, 10
|
||||
; GCN-NEXT: s_cselect_b32 s4, 5, 6
|
||||
; GCN-NEXT: s_cselect_b32 s5, 9, 2
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
|
||||
%bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
|
||||
store <4 x i32> %bo, ptr addrspace(1) %p, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
|
||||
define amdgpu_kernel void @sdiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sdiv_constant_sel_constants_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i64 121, i64 23
|
||||
%bo = sdiv i64 120, %sel
|
||||
store i64 %bo, ptr addrspace(1) %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i32:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 26, 8
|
||||
define amdgpu_kernel void @sdiv_constant_sel_constants_i32(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: sdiv_constant_sel_constants_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 26, 8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i32 7, i32 23
|
||||
%bo = sdiv i32 184, %sel
|
||||
store i32 %bo, ptr addrspace(1) %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
|
||||
define amdgpu_kernel void @udiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: udiv_constant_sel_constants_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 0, 5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i64 -4, i64 23
|
||||
%bo = udiv i64 120, %sel
|
||||
store i64 %bo, ptr addrspace(1) %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}srem_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
|
||||
define amdgpu_kernel void @srem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: srem_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 33, 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i64 34, i64 15
|
||||
%bo = srem i64 33, %sel
|
||||
store i64 %bo, ptr addrspace(1) %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}urem_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
|
||||
define amdgpu_kernel void @urem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: urem_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 33, 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i64 34, i64 15
|
||||
%bo = urem i64 33, %sel
|
||||
store i64 %bo, ptr addrspace(1) %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}shl_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 4, 8
|
||||
define amdgpu_kernel void @shl_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: shl_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 4, 8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i32 2, i32 3
|
||||
%bo = shl i32 1, %sel
|
||||
store i32 %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}lshr_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 16, 8
|
||||
define amdgpu_kernel void @lshr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: lshr_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 16, 8
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i32 2, i32 3
|
||||
%bo = lshr i32 64, %sel
|
||||
store i32 %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ashr_constant_sel_constants:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 32, 16
|
||||
define amdgpu_kernel void @ashr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: ashr_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 32, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, i32 2, i32 3
|
||||
%bo = ashr i32 128, %sel
|
||||
store i32 %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fsub_constant_sel_constants:
|
||||
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, 1.0,
|
||||
define amdgpu_kernel void @fsub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: fsub_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, -4.0, 1.0, s[2:3]
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, float -2.0, float 3.0
|
||||
%bo = fsub float -1.0, %sel
|
||||
store float %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fsub_constant_sel_constants_f16:
|
||||
; TODO: it shall be possible to fold constants with OpSel
|
||||
; GCN-DAG: v_mov_b32_e32 [[T:v[0-9]+]], 0x3c00
|
||||
; GCN-DAG: v_mov_b32_e32 [[F:v[0-9]+]], 0xc400
|
||||
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, [[F]], [[T]],
|
||||
define amdgpu_kernel void @fsub_constant_sel_constants_f16(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: fsub_constant_sel_constants_f16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xc400
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0x3c00
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
|
||||
; GCN-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, half -2.0, half 3.0
|
||||
%bo = fsub half -1.0, %sel
|
||||
store half %bo, ptr addrspace(1) %p, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v2f16:
|
||||
; GCN: s_cselect_b32 s{{[0-9]+}}, 0x45003c00, -2.0
|
||||
define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: fsub_constant_sel_constants_v2f16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, 0x45003c00, -2.0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, <2 x half> <half -2.0, half -3.0>, <2 x half> <half -1.0, half 4.0>
|
||||
%bo = fsub <2 x half> <half -1.0, half 2.0>, %sel
|
||||
store <2 x half> %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32:
|
||||
; GCN: s_mov_b32 [[T0:s[0-9]+]], 0x41500000
|
||||
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000
|
||||
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0
|
||||
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0
|
||||
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0
|
||||
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
|
||||
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
|
||||
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
|
||||
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
|
||||
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
|
||||
define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: fsub_constant_sel_constants_v4f32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: s_mov_b32 s3, 0x41500000
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000
|
||||
; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
|
||||
; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
|
||||
; GCN-NEXT: s_cselect_b32 s5, 1.0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
|
||||
%bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
|
||||
store <4 x float> %bo, ptr addrspace(1) %p, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fdiv_constant_sel_constants:
|
||||
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -2.0,
|
||||
define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: fdiv_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, 4.0, -2.0, s[2:3]
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, float -4.0, float 2.0
|
||||
%bo = fdiv float 8.0, %sel
|
||||
store float %bo, ptr addrspace(1) %p, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}frem_constant_sel_constants:
|
||||
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0,
|
||||
define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
|
||||
; GCN-LABEL: frem_constant_sel_constants:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_bitcmp1_b32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[2:3]
|
||||
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%sel = select i1 %cond, float -4.0, float 3.0
|
||||
%bo = frem float 5.0, %sel
|
||||
store float %bo, ptr addrspace(1) %p, align 4
|
||||
|
@ -1,3 +1,4 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
|
||||
|
||||
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
|
||||
@ -6,30 +7,40 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
declare float @llvm.fabs.f32(float) #1
|
||||
declare double @llvm.fabs.f64(double) #1
|
||||
|
||||
; SI-LABEL: {{^}}test_class_f32:
|
||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
|
||||
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_cmp_class_f32_e32 vcc, s7, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fabs_f32:
|
||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fabs_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
|
||||
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], |s7|, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
@ -37,15 +48,20 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32],
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fneg_f32:
|
||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fneg_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
|
||||
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -s7, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fneg = fsub float -0.0, %a
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
@ -53,15 +69,20 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32],
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
|
||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fneg_fabs_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
|
||||
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -|s7|, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
||||
%a.fneg.fabs = fsub float -0.0, %a.fabs
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
|
||||
@ -70,26 +91,36 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_1_f32:
|
||||
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 {
|
||||
; SI-LABEL: test_class_1_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 1
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_64_f32:
|
||||
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 {
|
||||
; SI-LABEL: test_class_64_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 64
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
@ -97,42 +128,62 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0
|
||||
}
|
||||
|
||||
; Set all 10 bits of mask
|
||||
; SI-LABEL: {{^}}test_class_full_mask_f32:
|
||||
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 {
|
||||
; SI-LABEL: test_class_full_mask_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x3ff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_9bit_mask_f32:
|
||||
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 {
|
||||
; SI-LABEL: test_class_9bit_mask_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}v_test_class_full_mask_f32:
|
||||
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
|
||||
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: v_test_class_full_mask_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_movk_i32 s4, 0x1ff
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v2, s4
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -144,13 +195,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
|
||||
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
|
||||
; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e32 vcc, 1.0, v2
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -163,14 +224,24 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a
|
||||
}
|
||||
|
||||
; FIXME: Why isn't this using a literal constant operand?
|
||||
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
|
||||
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
|
||||
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
|
||||
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
|
||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_class_lit_constant_dynamic_mask_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s4, 0x44800000
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -182,30 +253,40 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_f64:
|
||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fabs_f64:
|
||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fabs_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
@ -213,15 +294,20 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32],
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fneg_f64:
|
||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fneg_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -s[6:7], v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fneg = fsub double -0.0, %a
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
@ -229,15 +315,20 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32],
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
|
||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_class_fneg_fabs_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -|s[6:7]|, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
||||
%a.fneg.fabs = fsub double -0.0, %a.fabs
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
|
||||
@ -246,20 +337,38 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_1_f64:
|
||||
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 {
|
||||
; SI-LABEL: test_class_1_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 1
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_64_f64:
|
||||
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 {
|
||||
; SI-LABEL: test_class_64_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 64
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
@ -267,30 +376,45 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #
|
||||
}
|
||||
|
||||
; Set all 9 bits of mask
|
||||
; SI-LABEL: {{^}}test_class_full_mask_f64:
|
||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
|
||||
; SI-NOT: vcc
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 {
|
||||
; SI-LABEL: test_class_full_mask_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}v_test_class_full_mask_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
|
||||
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
|
||||
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
|
||||
; SI-NOT: vcc
|
||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: v_test_class_full_mask_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s2
|
||||
; SI-NEXT: s_mov_b32 s5, s3
|
||||
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_movk_i32 s4, 0x1ff
|
||||
; SI-NEXT: s_mov_b32 s2, 0
|
||||
; SI-NEXT: s_mov_b32 s3, s7
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], s4
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -302,11 +426,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
|
||||
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
|
||||
; SI: v_cmp_class_f64_e32 vcc,
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f64_e32 vcc, 1.0, v2
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -318,10 +454,25 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
|
||||
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_class_lit_constant_dynamic_mask_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s4, 0
|
||||
; SI-NEXT: s_mov_b32 s5, 0x40900000
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -333,12 +484,26 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_or_class_f32_0:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_fold_or_class_f32_0:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 3
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -353,12 +518,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_fold_or3_class_f32_0:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -375,13 +554,27 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
|
||||
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_fold_or_all_tests_class_f32_0:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_movk_i32 s2, 0x3ff
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s2
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -411,12 +604,26 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_or_class_f32_1:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_fold_or_class_f32_1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 12
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -431,12 +638,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_or_class_f32_2:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-LABEL: test_fold_or_class_f32_2:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -451,12 +672,29 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
|
||||
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
|
||||
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
|
||||
; SI: s_or_b64
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 {
|
||||
; SI-LABEL: test_no_fold_or_class_f32_0:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s12, 8
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 4
|
||||
; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
|
||||
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -471,72 +709,94 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_0_f32:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 {
|
||||
; SI-LABEL: test_class_0_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_0_f64:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 {
|
||||
; SI-LABEL: test_class_0_f64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_class_undef_f32:
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
|
||||
; SI-LABEL: test_class_undef_f32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
%result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1
|
||||
%sext = sext i1 %result to i32
|
||||
store i32 %sext, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_and_ord:
|
||||
; SI: s_waitcnt
|
||||
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
|
||||
; SI-NEXT: s_setpc_b64
|
||||
define i1 @test_fold_and_ord(float %a) {
|
||||
; SI-LABEL: test_fold_and_ord:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 32
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
|
||||
%ord = fcmp ord float %a, %a
|
||||
%and = and i1 %ord, %class
|
||||
ret i1 %and
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_and_unord:
|
||||
; SI: s_waitcnt
|
||||
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
|
||||
; SI-NEXT: s_setpc_b64
|
||||
define i1 @test_fold_and_unord(float %a) {
|
||||
; SI-LABEL: test_fold_and_unord:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
|
||||
%ord = fcmp uno float %a, %a
|
||||
%and = and i1 %ord, %class
|
||||
ret i1 %and
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}test_fold_and_ord_multi_use:
|
||||
; SI: v_cmp_class
|
||||
; SI-NOT: v_cmp_class
|
||||
; SI: v_cmp_o
|
||||
; SI: s_and_b64
|
||||
define i1 @test_fold_and_ord_multi_use(float %a) {
|
||||
; SI-LABEL: test_fold_and_ord_multi_use:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35
|
||||
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
|
||||
; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
|
||||
store volatile i1 %class, ptr addrspace(1) poison
|
||||
%ord = fcmp ord float %a, %a
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,4 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
|
||||
@ -5,162 +6,766 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=R600,FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_f32:
|
||||
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
|
||||
; GCN-HSA: flat_load_dword
|
||||
|
||||
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
|
||||
define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load float, ptr addrspace(1) %in
|
||||
store float %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v2f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx2
|
||||
; GCN-HSA: flat_load_dwordx2
|
||||
|
||||
; R600: VTX_READ_64
|
||||
define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v2f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v2f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v2f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <2 x float>, ptr addrspace(1) %in
|
||||
store <2 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v3f32:
|
||||
; SI-NOHSA: buffer_load_dwordx4
|
||||
; GCNX3-NOHSA: buffer_load_dwordx3
|
||||
; GCNX3-HSA: flat_load_dwordx3
|
||||
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v3f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v3f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v3f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <3 x float>, ptr addrspace(1) %in
|
||||
store <3 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v4f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v4f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v4f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v4f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <4 x float>, ptr addrspace(1) %in
|
||||
store <4 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v8f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v8f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v8f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v8f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <8 x float>, ptr addrspace(1) %in
|
||||
store <8 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v9f32:
|
||||
; GCN-NOHSA: buffer_load_dword
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dword
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_32
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v9f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v9f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dword v14, v[6:7]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dword v[10:11], v14
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v9f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <9 x float>, ptr addrspace(1) %in
|
||||
store <9 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v10f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx2
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx2
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v10f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v10f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v10f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <10 x float>, ptr addrspace(1) %in
|
||||
store <10 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v11f32:
|
||||
; SI-NOHSA: buffer_load_dwordx4
|
||||
; SI-NOHSA: buffer_load_dwordx4
|
||||
; SI-NOHSA: buffer_load_dwordx4
|
||||
; GCNX3-NOHSA: buffer_load_dwordx4
|
||||
; GCNX3-NOHSA: buffer_load_dwordx4
|
||||
; GCNX3-NOHSA: buffer_load_dwordx3
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx3
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v11f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:40
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v11f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v11f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <11 x float>, ptr addrspace(1) %in
|
||||
store <11 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v12f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v12f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v12f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v12f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <12 x float>, ptr addrspace(1) %in
|
||||
store <12 x float> %tmp0, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}global_load_v16f32:
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
; GCN-NOHSA: buffer_load_dwordx4
|
||||
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
; GCN-HSA: flat_load_dwordx4
|
||||
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; SI-NOHSA-LABEL: global_load_v16f32:
|
||||
; SI-NOHSA: ; %bb.0: ; %entry
|
||||
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
|
||||
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
|
||||
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
|
||||
; SI-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_load_v16f32:
|
||||
; GCN-HSA: ; %bb.0: ; %entry
|
||||
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
||||
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
||||
; GCN-HSA-NEXT: s_add_u32 s6, s2, 48
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
||||
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
|
||||
; GCN-HSA-NEXT: s_nop 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCNX3-NOHSA-LABEL: global_load_v16f32:
|
||||
; GCNX3-NOHSA: ; %bb.0: ; %entry
|
||||
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
|
||||
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
|
||||
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
|
||||
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
|
||||
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
|
||||
; GCNX3-NOHSA-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = load <16 x float>, ptr addrspace(1) %in
|
||||
store <16 x float> %tmp0, ptr addrspace(1) %out
|
||||
@ -168,3 +773,8 @@ entry:
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; FUNC: {{.*}}
|
||||
; GCN-NOHSA: {{.*}}
|
||||
; GCNX3-HSA: {{.*}}
|
||||
; R600: {{.*}}
|
||||
|
@ -1,15 +1,29 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
; FIXME: Fails with -enable-var-scope
|
||||
|
||||
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
|
||||
|
||||
; Extract the high bit of the low half
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_31_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -21,13 +35,24 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr
|
||||
}
|
||||
|
||||
; Extract the high bit of the high half
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
|
||||
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_63_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -38,12 +63,25 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_1_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -54,12 +92,25 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_20_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 1
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -70,13 +121,24 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
|
||||
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_32_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -87,13 +149,24 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_33_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -104,12 +177,25 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_20_21_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -120,12 +206,25 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_1_30_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -136,12 +235,25 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_1_31_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -154,13 +266,26 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad
|
||||
|
||||
; Spans the dword boundary, so requires full shift.
|
||||
; Truncated after the shift, so only low shift result is used.
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
|
||||
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
|
||||
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
|
||||
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_31_32_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: s_ashr_i32 s3, s2, 31
|
||||
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 31
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 3, v2
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -171,13 +296,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_32_33_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -188,14 +324,24 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
|
||||
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
|
||||
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
|
||||
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_30_60_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 30
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 0x3fffffff, v2
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -206,13 +352,24 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_33_63_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -223,12 +380,25 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr a
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
|
||||
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
|
||||
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_31_63_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_mov_b32 s11, s7
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 31
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -240,11 +410,23 @@ define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr a
|
||||
}
|
||||
|
||||
; trunc applied before and mask
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
|
||||
; GCN: buffer_store_dword v[[SHIFT]]
|
||||
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_31_i64_trunc_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v3
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -256,11 +438,23 @@ define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
|
||||
; GCN: buffer_store_dword [[BFE]]
|
||||
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_3_i64_trunc_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v0, v3, 3, 1
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -272,11 +466,24 @@ define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
|
||||
; GCN: buffer_store_dword [[BFE]]
|
||||
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_33_i64_trunc_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v0, v3, 1, 1
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -288,13 +495,24 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
|
||||
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
|
||||
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
|
||||
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
|
||||
; GCN-NOT: v[[SHRLO]]
|
||||
; GCN: buffer_store_dword v[[SHRLO]]
|
||||
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_31_32_i64_trunc_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 31
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -306,16 +524,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}and_not_mask_i64:
|
||||
; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
|
||||
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]]
|
||||
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
|
||||
; GCN-NOT: v[[SHRLO]]
|
||||
; GCN-NOT: v[[SHRHI]]
|
||||
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
|
||||
define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: and_not_mask_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 4, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -328,15 +554,29 @@ define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace
|
||||
|
||||
; The instruction count is the same with/without hasOneUse, but
|
||||
; keeping the 32-bit and has a smaller encoding size than the bfe.
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
|
||||
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
|
||||
; GCN-DAG: v_lshr_b64 v[[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]], [[VAL]], 27
|
||||
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
|
||||
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_27_29_multi_use_shift_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_mov_b32 s11, s7
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 27
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 3, v2
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -348,15 +588,30 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
|
||||
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
|
||||
; GCN-DAG: buffer_store_dwordx2 v[[[SHR]]:[[ZERO_SHR]]]
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO_BFE]]]
|
||||
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_34_37_multi_use_shift_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-NEXT: s_mov_b32 s11, s7
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v2
|
||||
; GCN-NEXT: v_bfe_u32 v2, v2, 2, 3
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
|
||||
@ -368,13 +623,32 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspac
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
|
||||
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v[[[BFE]]:{{[0-9]+\]}}
|
||||
; GCN: buffer_store_dword v[[ZERO]]
|
||||
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #1 {
|
||||
; GCN-LABEL: v_uextract_bit_33_36_use_upper_half_shift_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
|
||||
; GCN-NEXT: s_mov_b32 s2, 0
|
||||
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_load_dword v5, v[1:2], s[4:7], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v2
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, v2
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_u32 v5, v5, 1, 3
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[5:6], v[1:2], s[0:3], 0 addr64
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_store_dword v2, v[3:4], s[8:11], 0 addr64
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
|
||||
%out0.gep = getelementptr i64, ptr addrspace(1) %out0, i32 %id.x
|
||||
|
@ -1,51 +1,150 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
|
||||
define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) {
|
||||
; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
|
||||
; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[4:5],
|
||||
; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
|
||||
; SI: buffer_store_dword [[VLOAD]]
|
||||
; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]]
|
||||
|
||||
; EG-LABEL: {{^}}trunc_i64_to_i32_store:
|
||||
; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG: LSHR
|
||||
; EG-NEXT: 2(
|
||||
|
||||
; SI-LABEL: trunc_i64_to_i32_store:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: trunc_i64_to_i32_store:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: trunc_i64_to_i32_store:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MOV * T1.X, KC0[4].W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%result = trunc i64 %in to i32 store i32 %result, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}trunc_load_shl_i64:
|
||||
; GCN-DAG: s_load_dwordx2
|
||||
; GCN-DAG: s_load_dword [[SREG:s[0-9]+]],
|
||||
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
|
||||
; GCN: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
|
||||
; SI: buffer_store_dword [[VSHL]]
|
||||
; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
|
||||
|
||||
define amdgpu_kernel void @trunc_load_shl_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
|
||||
; SI-LABEL: trunc_load_shl_i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshl_b32 s4, s6, 2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: trunc_load_shl_i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b32 s2, s2, 2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: trunc_load_shl_i64:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: LSHL * T1.X, KC0[4].W, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%b = shl i64 %a, 2
|
||||
%result = trunc i64 %b to i32
|
||||
store i32 %result, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}trunc_shl_i64:
|
||||
; SI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; VI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||
; GCN: s_lshl_b64 s[[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s[[[LO_SREG]]:{{[0-9]+\]}}, 2
|
||||
; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
|
||||
; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
|
||||
; SI: buffer_store_dword v[[LO_VREG]],
|
||||
; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
|
||||
; GCN: v_mov_b32_e32
|
||||
; GCN: v_mov_b32_e32
|
||||
define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1) %out, i64 %a) {
|
||||
; SI-LABEL: trunc_shl_i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
|
||||
; SI-NEXT: s_add_u32 s8, s8, 0x3a8
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_mov_b32 s0, s2
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_mov_b32 s2, s6
|
||||
; SI-NEXT: s_mov_b32 s3, s7
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: s_addc_u32 s9, s9, 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: trunc_shl_i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 2
|
||||
; VI-NEXT: s_add_u32 s0, s0, 0x3a8
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: flat_store_dword v[2:3], v4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: trunc_shl_i64:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[2].W, literal.x,
|
||||
; EG-NEXT: ADDC_UINT * T2.W, PV.W, literal.y,
|
||||
; EG-NEXT: 30(4.203895e-44), 936(1.311615e-42)
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: ADD_INT T1.Y, PV.W, PS,
|
||||
; EG-NEXT: ADD_INT * T1.X, T0.W, literal.y,
|
||||
; EG-NEXT: 2(2.802597e-45), 936(1.311615e-42)
|
||||
; EG-NEXT: LSHR * T2.X, KC0[2].Z, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%aa = add i64 %a, 234 ; Prevent shrinking store.
|
||||
%b = shl i64 %aa, 2
|
||||
%result = trunc i64 %b to i32
|
||||
@ -54,9 +153,55 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}trunc_i32_to_i1:
|
||||
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
|
||||
; SI-LABEL: trunc_i32_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: trunc_i32_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v2, 1, v2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: trunc_i32_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: TEX 0 @6
|
||||
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
|
||||
; EG-NEXT: ALU clause starting at 8:
|
||||
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
||||
; EG-NEXT: ALU clause starting at 9:
|
||||
; EG-NEXT: AND_INT T0.X, T0.X, 1,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%a = load i32, ptr addrspace(1) %ptr, align 4
|
||||
%trunc = trunc i32 %a to i1
|
||||
%result = select i1 %trunc, i32 1, i32 0
|
||||
@ -64,9 +209,64 @@ define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}trunc_i8_to_i1:
|
||||
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
|
||||
; SI-LABEL: trunc_i8_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s10, s6
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s2
|
||||
; SI-NEXT: s_mov_b32 s9, s3
|
||||
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: trunc_i8_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_load_ubyte v2, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v2, 1, v2
|
||||
; VI-NEXT: flat_store_byte v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: trunc_i8_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: TEX 0 @6
|
||||
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
||||
; EG-NEXT: ALU clause starting at 8:
|
||||
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
||||
; EG-NEXT: ALU clause starting at 9:
|
||||
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
|
||||
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
|
||||
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
||||
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
||||
; EG-NEXT: MOV T0.Y, 0.0,
|
||||
; EG-NEXT: MOV * T0.Z, 0.0,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%a = load i8, ptr addrspace(1) %ptr, align 4
|
||||
%trunc = trunc i8 %a to i1
|
||||
%result = select i1 %trunc, i8 1, i8 0
|
||||
@ -74,43 +274,213 @@ define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
|
||||
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
|
||||
define amdgpu_kernel void @sgpr_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %a) {
|
||||
; SI-LABEL: sgpr_trunc_i16_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s4, s6, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: sgpr_trunc_i16_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s2, s2, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: sgpr_trunc_i16_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
|
||||
; EG-NEXT: TEX 0 @6
|
||||
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
|
||||
; EG-NEXT: ALU clause starting at 8:
|
||||
; EG-NEXT: MOV * T0.X, 0.0,
|
||||
; EG-NEXT: ALU clause starting at 9:
|
||||
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
|
||||
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
|
||||
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
||||
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
||||
; EG-NEXT: MOV T0.Y, 0.0,
|
||||
; EG-NEXT: MOV * T0.Z, 0.0,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%trunc = trunc i16 %a to i1
|
||||
%result = select i1 %trunc, i16 1, i16 0
|
||||
store i16 %result, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
|
||||
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
|
||||
define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
|
||||
; SI-LABEL: sgpr_trunc_i32_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s4, s6, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: sgpr_trunc_i32_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s2, s2, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: sgpr_trunc_i32_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.X, KC0[2].Z, 1,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%trunc = trunc i32 %a to i1
|
||||
%result = select i1 %trunc, i32 1, i32 0
|
||||
store i32 %result, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
|
||||
; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||
; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||
; GCN: s_bitcmp1_b32 s[[SLO]], 0
|
||||
; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
|
||||
define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
|
||||
; SI-LABEL: s_trunc_i64_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bitcmp1_b32 s6, 0
|
||||
; SI-NEXT: s_cselect_b32 s4, 63, -12
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_trunc_i64_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
|
||||
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_bitcmp1_b32 s0, 0
|
||||
; VI-NEXT: s_cselect_b32 s0, 63, -12
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: s_trunc_i64_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: MOV T0.W, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.W, KC0[4].W, 1,
|
||||
; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: CNDE_INT T0.X, PS, literal.x, PV.W,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
||||
; EG-NEXT: -12(nan), 2(2.802597e-45)
|
||||
%trunc = trunc i64 %x to i1
|
||||
%sel = select i1 %trunc, i32 63, i32 -12
|
||||
store i32 %sel, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_trunc_i64_to_i1:
|
||||
; SI: buffer_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
|
||||
; VI: flat_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
|
||||
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
|
||||
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
|
||||
define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
||||
; SI-LABEL: v_trunc_i64_to_i1:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v2, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_and_b32_e32 v0, 1, v3
|
||||
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_trunc_i64_to_i1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
|
||||
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: v_and_b32_e32 v0, 1, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
|
||||
; VI-NEXT: flat_store_dword v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: v_trunc_i64_to_i1:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: TEX 0 @6
|
||||
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
|
||||
; EG-NEXT: ALU clause starting at 8:
|
||||
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
|
||||
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
|
||||
; EG-NEXT: ALU clause starting at 11:
|
||||
; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
|
||||
; EG-NEXT: AND_INT T0.W, T1.X, 1, BS:VEC_120/SCL_212
|
||||
; EG-NEXT: MOV * T1.W, literal.y,
|
||||
; EG-NEXT: 2(2.802597e-45), 63(8.828180e-44)
|
||||
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, PS,
|
||||
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
|
||||
; EG-NEXT: -12(nan), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
%gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
|
||||
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
|
||||
@ -121,3 +491,5 @@ define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspac
|
||||
store i32 %sel, ptr addrspace(1) %out.gep
|
||||
ret void
|
||||
}
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
|
Loading…
x
Reference in New Issue
Block a user