[AMDGPU][UniformityAnalysis] Mark set_inactive and set_inactive_chain_arg as SourceOfDivergence (#190640)

`set_inactive` and `set_inactive_chain_arg` produce a result that varies per lane depending on the EXEC mask, even when both inputs are uniform, so uniformity analysis must treat them as sources of divergence.
2026-04-06 12:40:22 -07:00 · 2026-04-06 12:40:22 -07:00 · 40d5a7d69e
commit 40d5a7d69e
parent 326593b4b4
3 changed files with 24 additions and 6 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -334,6 +334,8 @@ def : SourceOfDivergence<int_amdgcn_writelane>;
 def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
 def : SourceOfDivergence<int_amdgcn_permlane16_swap>;
 def : SourceOfDivergence<int_amdgcn_permlane32_swap>;
+def : SourceOfDivergence<int_amdgcn_set_inactive>;
+def : SourceOfDivergence<int_amdgcn_set_inactive_chain_arg>;

 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -846,5 +846,19 @@ define amdgpu_cs void @call_whole_wave(ptr addrspace(1) %out) {

 declare amdgpu_gfx_whole_wave i32 @wwf(i1, i32) #0

+; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0)
+define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
+  %set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0) #0
+  store i32 %set_inactive, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive)
+define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %inactive, i32 %active) #0 {
+  %set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
+  store i32 %set_inactive, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind convergent }
 attributes #1 = { nounwind readnone convergent }
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -21,12 +21,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
 ; GCN-NEXT:  ; %bb.1: ; %bb42
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:  .LBB0_2: ; %bb602
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v1
+; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-NEXT:  ; %bb.3: ; %bb49
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GCN-NEXT:    tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc
-; GCN-NEXT:  .LBB0_4: ; %bb54
+; GCN-NEXT:  .LBB0_4: ; %UnifiedReturnBlock
 ; GCN-NEXT:    s_endpgm
 entry:
  br label %work
@@ -75,12 +76,13 @@ define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer
 ; GCN-NEXT:  ; %bb.1: ; %bb42
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:  .LBB1_2: ; %bb602
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
-; GCN-NEXT:    s_cbranch_vccnz .LBB1_4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v1
+; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-NEXT:  ; %bb.3: ; %bb49
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GCN-NEXT:    tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc
-; GCN-NEXT:  .LBB1_4: ; %bb54
+; GCN-NEXT:  .LBB1_4: ; %UnifiedReturnBlock
 ; GCN-NEXT:    s_endpgm
 entry:
  br label %work