llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
Matt Arsenault 39bf765bb6
DAG: Use phi to create vregs instead of the constant input (#129464)
For most targets, the register class comes from the type so this
makes no difference. For AMDGPU, the selected register class depends
on the divergence of the value. For a constant phi input, this will
always be false. The heuristic for whether to treat the value as
a scalar or vector constant based on the uses would then incorrectly
think this is a scalar use, when really the phi is a copy from S to V.

This avoids an intermediate s_mov_b32 plus a copy in some cases. These
would often, but not always, fold out in mi passes.

This only adjusts the constant input case. It may make sense to do
this for the non-constant case as well.
2025-03-04 14:44:54 +07:00

594 lines
19 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
; Test ballot(0)
define amdgpu_cs i32 @constant_false() {
; CHECK-LABEL: constant_false:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ; return to shader part epilog
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0)
ret i32 %ballot
}
; Test ballot(1)
define amdgpu_cs i32 @constant_true() {
; CHECK-LABEL: constant_true:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1)
ret i32 %ballot
}
; Test ballot of a non-comparison operation
define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
ret i32 %ballot
}
; Test ballot of comparisons
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
ret i32 %ballot
}
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
ret i32 %ballot
}
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
ret i32 %ballot
}
define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
%bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
ret i32 %bcnt
}
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
; CHECK-NEXT: .LBB7_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB7_3
; CHECK-NEXT: .LBB7_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
; CHECK-NEXT: .LBB8_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB8_3
; CHECK-NEXT: .LBB8_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
; CHECK-NEXT: .LBB9_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB9_3
; CHECK-NEXT: .LBB9_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
; CHECK-NEXT: .LBB11_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB11_3
; CHECK-NEXT: .LBB11_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
; CHECK-NEXT: .LBB12_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB12_3
; CHECK-NEXT: .LBB12_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
; CHECK-NEXT: .LBB13_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB13_3
; CHECK-NEXT: .LBB13_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
; CHECK-NEXT: .LBB15_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB15_3
; CHECK-NEXT: .LBB15_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
; CHECK-NEXT: .LBB16_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB16_3
; CHECK-NEXT: .LBB16_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
; CHECK-NEXT: .LBB17_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB17_3
; CHECK-NEXT: .LBB17_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
; CHECK-NEXT: s_cmp_lt_i32 s0, 23
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB19_3
; CHECK-NEXT: .LBB19_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB19_3
; CHECK-NEXT: .LBB19_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%bc = icmp sgt i32 %ballot, 22
br i1 %bc, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccnz .LBB20_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB20_3
; CHECK-NEXT: .LBB20_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB20_3
; CHECK-NEXT: .LBB20_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; TODO:
; s_cmp_lt_u32 s0, 12
; s_cselect_b32 s0, -1, 0
; s_cmp_gt_u32 s1, 34
; s_cselect_b32 s1, -1, 0
; s_and_b32 s0, s0, s1
; s_and_b32 s0, s0, exec_lo
; could be improved to:
; s_cmp_lt_u32 s0, 12
; s_cselect_b32 s0, -1, 0
; s_cmp_gt_u32 s1, 34
; s_cselect_b32 s0, s0, 0
; s_and_b32 s0, s0, exec_lo
; By selecting into vcc(_lo) instead, we could even avoid the AND-with-exec.
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB21_3
; CHECK-NEXT: .LBB21_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB21_3
; CHECK-NEXT: .LBB21_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
%ballot_ne_zero = icmp ne i32 %ballot, 0
br i1 %ballot_ne_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccnz .LBB22_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB22_3
; CHECK-NEXT: .LBB22_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB22_3
; CHECK-NEXT: .LBB22_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc1 .LBB23_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB23_3
; CHECK-NEXT: .LBB23_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB23_3
; CHECK-NEXT: .LBB23_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
%ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
%ballot_eq_zero = icmp eq i32 %ballot, 0
br i1 %ballot_eq_zero, label %true, label %false
true:
ret i32 42
false:
ret i32 33
}
; Input that is not constant or direct result of a compare.
; Tests setting 0 to inactive lanes.
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: non_cst_non_compare_input:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $sgpr0
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT: ; %bb.1: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
; GFX10-NEXT: ; implicit-def: $vgpr2
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT: ; %bb.3: ; %A
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: non_cst_non_compare_input:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: ; implicit-def: $sgpr0
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX11-NEXT: ; %bb.1: ; %B
; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
; GFX11-NEXT: ; implicit-def: $vgpr2
; GFX11-NEXT: ; %bb.2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
; GFX11-NEXT: ; %bb.3: ; %A
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; %bb.4: ; %exit
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %A, label %B
A:
%val_A = icmp uge i32 %tid, 1
br label %exit
B:
%val_B = icmp ult i32 %tid, 2
br label %exit
exit:
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi)
store i32 %ballot, ptr addrspace(1) %out
ret void
}