[AMDGPU][UniformityAnalysis] Mark set_inactive and set_inactive_chain_arg as SourceOfDivergence (#190640)

`set_inactive` and `set_inactive_chain_arg` produce a result that varies per-lane based on the EXEC mask, even when both inputs are uniform, so uniformity analysis must treat them as sources of divergence.
This commit is contained in:
Chinmay Deshpande 2026-04-06 12:40:22 -07:00 committed by GitHub
parent 326593b4b4
commit 40d5a7d69e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 6 deletions

View File

@ -334,6 +334,8 @@ def : SourceOfDivergence<int_amdgcn_writelane>;
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
def : SourceOfDivergence<int_amdgcn_permlane16_swap>;
def : SourceOfDivergence<int_amdgcn_permlane32_swap>;
// The result of set_inactive varies per-lane based on the EXEC mask, even when
// both inputs are uniform, so it (and its chain-arg variant) is listed here.
def : SourceOfDivergence<int_amdgcn_set_inactive>;
def : SourceOfDivergence<int_amdgcn_set_inactive_chain_arg>;
// Emit one SourceOfDivergence record for each entry of AMDGPUMFMAIntrinsics908.
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;

View File

@ -846,5 +846,19 @@ define amdgpu_cs void @call_whole_wave(ptr addrspace(1) %out) {
declare amdgpu_gfx_whole_wave i32 @wwf(i1, i32) #0
; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0)
; Calls llvm.amdgcn.set.inactive with the argument %in as the first operand and
; the constant 0 as the second, then stores the result through %out.
; The expectation line for this function requires the call to be reported as
; DIVERGENT by the analysis.
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
%set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0) #0
store i32 %set_inactive, ptr addrspace(1) %out, align 4
ret void
}
; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive)
; Calls llvm.amdgcn.set.inactive.chain.arg and stores the result through %out.
; Note the operand order: the call passes %active first and %inactive second,
; which is the reverse of the order in which they appear in the parameter list.
; The expectation line for this function requires the call to be reported as
; DIVERGENT by the analysis.
define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %inactive, i32 %active) #0 {
%set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
store i32 %set_inactive, ptr addrspace(1) %out, align 4
ret void
}
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }

View File

@ -21,12 +21,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
; GCN-NEXT: ; %bb.1: ; %bb42
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: .LBB0_2: ; %bb602
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GCN-NEXT: s_cbranch_vccnz .LBB0_4
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.3: ; %bb49
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc
; GCN-NEXT: .LBB0_4: ; %bb54
; GCN-NEXT: .LBB0_4: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
entry:
br label %work
@ -75,12 +76,13 @@ define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer
; GCN-NEXT: ; %bb.1: ; %bb42
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: .LBB1_2: ; %bb602
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GCN-NEXT: s_cbranch_vccnz .LBB1_4
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1
; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT: s_cbranch_execz .LBB1_4
; GCN-NEXT: ; %bb.3: ; %bb49
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc
; GCN-NEXT: .LBB1_4: ; %bb54
; GCN-NEXT: .LBB1_4: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
entry:
br label %work