Cover all the missing cases and add very detailed tests for each rule. In summary: - Flat and Scratch, addrspace(0) and addrspace(5): loads are always divergent. - Global and Constant, addrspace(1) and addrspace(4): these have real uniform loads (s_load), but require additional checks on the alignment and flags in the MMO. If the access is not naturally aligned or the MMO is not uniform, do uniform-in-vgpr lowering. - Local (LDS), addrspace(3): only has instructions for divergent loads; for uniform loads, do uniform-in-vgpr lowering. - Store rules are simplified using Ptr32 and Ptr64; all operands need to be vgpr. Some tests have code size regressions since they use more sgpr instructions; these are marked with a FixMe comment to get back to later.
40 lines
1.1 KiB
LLVM
40 lines
1.1 KiB
LLVM
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
|
|
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
|
|
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
|
|
|
|
; This test makes sure we do not double count global values when they are
|
|
; used in different basic blocks.
|
|
|
|
; GCN: .long 47180
|
|
; GCN-NEXT: .long 32900
|
|
|
|
; EG: .long 166120
|
|
; EG-NEXT: .long 1
|
|
; ALL: {{^}}test:
|
|
|
|
; HSA-NOT: COMPUTE_PGM_RSRC2.LDS_SIZE
|
|
; HSA: .amdhsa_group_segment_fixed_size 4
|
|
|
|
; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
|
|
; The only addrspace(3) global in this file, a single align-4 i32. @test stores
; to it from two different basic blocks, and the checks above expect a group
; segment size of 4, i.e. the global is counted once.
; NOTE(review): none of the visible RUN lines passes the GCN check prefix, so
; the GCN-prefixed check lines appear to be inactive -- confirm.
@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4
|
|
|
|
; Kernel under test. It branches on whether %cond is zero and writes to @lds
; in each arm, so the same addrspace(3) global is referenced from two distinct
; basic blocks. %out is not used anywhere in the body.
define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %cond) {
|
|
entry:
|
|
; %0 is true exactly when %cond == 0.
%0 = icmp eq i32 %cond, 0
|
|
; Take the `if` block when %cond is zero, the `else` block otherwise.
br i1 %0, label %if, label %else
|
|
|
|
if:
|
|
; First use of @lds (stores the constant 1).
store i32 1, ptr addrspace(3) @lds
|
|
br label %endif
|
|
|
|
else:
|
|
; Second use of @lds, in a different basic block (stores the constant 2).
store i32 2, ptr addrspace(3) @lds
|
|
br label %endif
|
|
|
|
endif:
|
|
; Both arms rejoin here; the kernel returns nothing.
ret void
|
|
}
|
|
|
|
; Module flags list; its single entry !0 sets "amdhsa_code_object_version" to 400.
; NOTE(review): the leading `i32 1` in !0 is presumably the module-flag merge
; behavior, and 400 presumably selects code object v4 -- confirm against the
; LangRef / AMDGPU usage guide.
!llvm.module.flags = !{!0}
|
|
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
|