
For most targets, the register class comes from the type, so this makes no difference. For AMDGPU, the selected register class depends on the divergence of the value. For a constant phi input, the divergence will always be false. The heuristic that decides whether to treat the value as a scalar or vector constant based on its uses would then incorrectly conclude that this is a scalar use, when really the phi is a copy from S to V. This avoids an intermediate s_mov_b32 plus a copy in some cases. These would often, but not always, fold out in MI passes. This only adjusts the constant-input case. It may make sense to do this for the non-constant case as well.
70 lines
2.9 KiB
LLVM
70 lines
2.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s

; Make sure we match the addressing mode offset of csub intrinsics across blocks.

; Kernel under test. The address of the csub atomic is formed in %if as
; %in + 7 x i32 (28 bytes); the llc checks below expect that constant folded
; into the instruction as "offset:28". The phi in %endif merges the atomic's
; result with a constant 0 coming straight from %entry; the llc checks expect
; that 0 to be written to v0 with a single v_mov_b32 in the entry block.
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT:  entry:
; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT:    br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT:       if:
; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 7
; OPT-NEXT:    [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) [[IN_GEP]], i32 2)
; OPT-NEXT:    br label [[ENDIF]]
; OPT:       endif:
; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 999999
; OPT-NEXT:    store i32 [[X]], ptr addrspace(1) [[OUT_GEP]], align 4
; OPT-NEXT:    br label [[DONE:%.*]]
; OPT:       done:
; OPT-NEXT:    ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT:    v_mbcnt_lo_u32_b32 v1, -1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_mov_b32 s4, exec_lo
; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v1
; GCN-NEXT:    s_cbranch_execz .LBB0_2
; GCN-NEXT:  ; %bb.1: ; %if
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v1, 2
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
; GCN-NEXT:  .LBB0_2: ; %endif
; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GCN-NEXT:    v_mov_b32_e32 v1, 0x3d0800
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT:    s_endpgm
entry:
  ; Skip the atomic entirely when the mbcnt.lo result is 0.
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %cmp = icmp eq i32 %tid, 0
  br i1 %cmp, label %endif, label %if

if:
  ; The address is computed in this block but the value is consumed in
  ; %endif: 7 x i32 = 28 bytes past %in.
  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 7
  %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %in.gep, i32 2)
  br label %endif

endif:
  ; %x is the atomic's result on the %if edge and the constant 0 on the
  ; %entry edge.
  %x = phi i32 [ %val, %if ], [ 0, %entry ]
  ; 999999 x i32 = 3999996 bytes = 0x3d0800 + 252, which is how the expected
  ; store splits the address between v1 and its immediate offset.
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 999999
  store i32 %x, ptr addrspace(1) %out.gep
  br label %done

done:
  ret void
}

; Intrinsics called by @test_sink_small_offset_global_atomic_csub_i32.
declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1

; #0 is attached to the csub declaration and to the mbcnt.lo call site in the
; kernel; #1 is attached to the mbcnt.lo declaration. #2 has no user in the
; visible part of this file and is kept as-is.
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind readnone willreturn }
attributes #2 = { argmemonly nounwind willreturn }