From 40d5a7d69e957750a2d45ae9dd68f486b5306f77 Mon Sep 17 00:00:00 2001 From: Chinmay Deshpande Date: Mon, 6 Apr 2026 12:40:22 -0700 Subject: [PATCH] [AMDGPU][UniformityAnalysis] Mark set_inactive and set_inactive_chain_arg as SourceOfDivergence (#190640) `set_inactive` produces a result that varies per-lane based on the EXEC mask, even when both inputs are uniform. --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 2 ++ .../UniformityAnalysis/AMDGPU/intrinsics.ll | 14 ++++++++++++++ llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 14 ++++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 13bd7a5addd7..4cb200cd51e5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -334,6 +334,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence<int_amdgcn_set_inactive>; +def : SourceOfDivergence<int_amdgcn_set_inactive_chain_arg>; foreach intr = AMDGPUMFMAIntrinsics908 in def : SourceOfDivergence<intr>; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 0ad74eb7923a..ab33a90f4974 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -846,5 +846,19 @@ define amdgpu_cs void @call_whole_wave(ptr addrspace(1) %out) { declare amdgpu_gfx_whole_wave i32 @wwf(i1, i32) #0 +; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0) +define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) #0 { + %set_inactive = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 0) #0 + store i32 %set_inactive, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK: DIVERGENT: %set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) +define
amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %inactive, i32 %active) #0 { + %set_inactive = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0 + store i32 %set_inactive, ptr addrspace(1) %out, align 4 + ret void +} + attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index c195642aaf6e..c1b9e51ac843 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -21,12 +21,13 @@ define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) { ; GCN-NEXT: ; %bb.1: ; %bb42 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: .LBB0_2: ; %bb602 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GCN-NEXT: s_cbranch_vccnz .LBB0_4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %bb49 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc -; GCN-NEXT: .LBB0_4: ; %bb54 +; GCN-NEXT: .LBB0_4: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm entry: br label %work @@ -75,12 +76,13 @@ define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer ; GCN-NEXT: ; %bb.1: ; %bb42 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: .LBB1_2: ; %bb602 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GCN-NEXT: s_cbranch_vccnz .LBB1_4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: ; %bb.3: ; %bb49 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: tbuffer_store_format_x v1, off, s[4:7], 1 format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] offset:4 glc -; GCN-NEXT: .LBB1_4: ; %bb54 +; GCN-NEXT: .LBB1_4: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm entry: br label %work