
For most targets, the register class is determined by the type, so this makes no difference. For AMDGPU, the selected register class depends on the divergence of the value. For a constant phi input, the divergence will always be false. The heuristic that decides whether to treat the value as a scalar or vector constant based on its uses would then incorrectly conclude that this is a scalar use, when the phi is really a copy from S to V. This avoids an intermediate s_mov_b32 plus a copy in some cases. These would often, but not always, fold out in MI passes. This only adjusts the constant input case. It may make sense to do this for the non-constant case as well.
594 lines
19 KiB
LLVM
594 lines
19 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
|
|
|
|
; Ballot intrinsic exercised by the tests in this file: takes an i1 and
; returns an i32.
declare i32 @llvm.amdgcn.ballot.i32(i1)
|
|
; Population-count intrinsic; ctpop_of_ballot applies it to a ballot result.
declare i32 @llvm.ctpop.i32(i32)
|
|
|
|
; Test ballot(0): the intrinsic is called with a literal false and its i32
; result is returned directly. Expected output is a single move of 0 into s0.
define amdgpu_cs i32 @constant_false() {
; CHECK-LABEL: constant_false:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ; return to shader part epilog
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 false)
  ret i32 %mask
}
|
|
|
|
; Test ballot(1): the intrinsic is called with a literal true and its i32
; result is returned directly. Expected output is a copy of exec_lo into s0.
define amdgpu_cs i32 @constant_true() {
; CHECK-LABEL: constant_true:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, exec_lo
; CHECK-NEXT: ; return to shader part epilog
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 true)
  ret i32 %mask
}
|
|
|
|
; Test ballot of a non-comparison operation: the i1 operand comes from
; truncating the (non-inreg) i32 argument rather than from an icmp/fcmp.
; Expected output masks bit 0 and compares the result against zero.
define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT: ; return to shader part epilog
  %low_bit = trunc i32 %x to i1
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %low_bit)
  ret i32 %mask
}
|
|
|
|
; Test ballot of comparisons
|
|
|
|
; Ballot of an integer equality compare between two non-inreg arguments.
; Expected output is a single vector compare writing its result to s0.
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
; CHECK-NEXT: ; return to shader part epilog
  %is_equal = icmp eq i32 %x, %y
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %is_equal)
  ret i32 %mask
}
|
|
|
|
; Ballot of a signed compare against an immediate (%x >= 99). The expected
; output expresses this as 0x62 (98) < %x.
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0
; CHECK-NEXT: ; return to shader part epilog
  %at_least_99 = icmp sge i32 %x, 99
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %at_least_99)
  ret i32 %mask
}
|
|
|
|
; Ballot of an ordered greater-than floating-point compare between two
; non-inreg float arguments.
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
; CHECK-NEXT: ; return to shader part epilog
  %is_greater = fcmp ogt float %x, %y
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %is_greater)
  ret i32 %mask
}
|
|
|
|
; Population count of a ballot: the ballot of (%x > %y) is fed to
; llvm.ctpop.i32 and the count is returned. Expected output is a compare into
; vcc_lo followed by a scalar bit count of vcc_lo.
define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
; CHECK-NEXT: ; return to shader part epilog
  %is_greater = fcmp ogt float %x, %y
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %is_greater)
  %popcount = call i32 @llvm.ctpop.i32(i32 %mask)
  ret i32 %popcount
}
|
|
|
|
; Branch on (ballot != 0) where the ballot operand is the low bit of a
; non-inreg argument (a trunc, not a compare). Returns 42 on the %true path
; and 33 on the %false path.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB7_3
; CHECK-NEXT: .LBB7_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB7_3
; CHECK-NEXT: .LBB7_3:
  %low_bit = trunc i32 %v to i1
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %low_bit)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Same shape as the divergent variant, but the argument is inreg (the expected
; output reads it from s0): branch on (ballot != 0) of the truncated low bit.
; Returns 42 on the %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT: s_cbranch_vccz .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
; CHECK-NEXT: .LBB8_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB8_3
; CHECK-NEXT: .LBB8_3:
  %low_bit = trunc i32 %v to i1
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %low_bit)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) where the ballot operand is the low bit of a
; non-inreg argument (a trunc, not a compare). Returns 42 on the %true path
; and 33 on the %false path; the expected output lays out %false first.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_cbranch_vccz .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB9_3
; CHECK-NEXT: .LBB9_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB9_3
; CHECK-NEXT: .LBB9_3:
  %low_bit = trunc i32 %v to i1
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %low_bit)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) of the truncated low bit of an inreg argument (the
; expected output reads it from s0). Returns 42 on the %true path and 33 on
; the %false path; the expected output lays out %false first.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
; CHECK-NEXT: s_cbranch_vccz .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
  %low_bit = trunc i32 %v to i1
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %low_bit)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot != 0) where the ballot operand is an unsigned compare
; (%v < 12) of a non-inreg argument. Returns 42 on the %true path and 33 on
; the %false path.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: s_cbranch_vccz .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB11_3
; CHECK-NEXT: .LBB11_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB11_3
; CHECK-NEXT: .LBB11_3:
  %below_12 = icmp ult i32 %v, 12
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %below_12)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot != 0) where the ballot operand is an unsigned compare
; (%v < 12) of an inreg argument (the expected output reads it from s0).
; Returns 42 on the %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT: s_cbranch_vccz .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
; CHECK-NEXT: .LBB12_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB12_3
; CHECK-NEXT: .LBB12_3:
  %below_12 = icmp ult i32 %v, 12
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %below_12)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) where the ballot operand is an unsigned compare
; (%v < 12) of a non-inreg argument. Returns 42 on the %true path and 33 on
; the %false path; the expected output lays out %false first.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: s_cbranch_vccz .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB13_3
; CHECK-NEXT: .LBB13_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB13_3
; CHECK-NEXT: .LBB13_3:
  %below_12 = icmp ult i32 %v, 12
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %below_12)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) where the ballot operand is an unsigned compare
; (%v < 12) of an inreg argument (the expected output reads it from s0).
; Returns 42 on the %true path and 33 on the %false path; the expected output
; lays out %false first.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
; CHECK-NEXT: s_cbranch_vccz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
  %below_12 = icmp ult i32 %v, 12
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %below_12)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot != 0) where the ballot operand is the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) on non-inreg arguments. Returns 42 on the
; %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB15_3
; CHECK-NEXT: .LBB15_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB15_3
; CHECK-NEXT: .LBB15_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %both)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot != 0) where the ballot operand is the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) on inreg arguments. The expected output
; uses scalar compares/selects and an AND with exec_lo before branching on
; scc. Returns 42 on the %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
; CHECK-NEXT: .LBB16_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB16_3
; CHECK-NEXT: .LBB16_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %both)
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) where the ballot operand is the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) on non-inreg arguments. Returns 42 on the
; %true path and 33 on the %false path; the expected output lays out %false
; first.
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccz .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB17_3
; CHECK-NEXT: .LBB17_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB17_3
; CHECK-NEXT: .LBB17_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %both)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on (ballot == 0) where the ballot operand is the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) on inreg arguments. The expected output
; uses scalar compares/selects and an AND with exec_lo before branching on
; scc. Returns 42 on the %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %both)
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Branch on a ballot compared against a non-zero constant: the ballot of
; (%v < 12) on an inreg argument is tested with a signed greater-than 22.
; The expected output materializes the ballot in s0 and compares it with a
; scalar s_cmp_lt_i32 against 23. Returns 42 on %true and 33 on %false.
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
; CHECK-NEXT: s_cmp_lt_i32 s0, 23
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB19_3
; CHECK-NEXT: .LBB19_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB19_3
; CHECK-NEXT: .LBB19_3:
  %below_12 = icmp ult i32 %v, 12
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %below_12)
  %mask_above_22 = icmp sgt i32 %mask, 22
  br i1 %mask_above_22, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; AMDGPU icmp intrinsic used by the "simulated negated ballot" tests below,
; which compare an i1 against a constant false using predicate code 32
; (ICMP_EQ) and branch on the i32 result.
declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
|
|
|
|
; Simulated negated ballot: instead of the ballot intrinsic, the AND of two
; unsigned compares (%v1 < 12 and %v2 > 34, non-inreg arguments) is compared
; for equality with false via llvm.amdgcn.icmp, and the i32 result is tested
; for != 0. Returns 42 on the %true path and 33 on the %false path.
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccnz .LBB20_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB20_3
; CHECK-NEXT: .LBB20_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB20_3
; CHECK-NEXT: .LBB20_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.icmp.i32(i1 %both, i1 false, i32 32) ; ICMP_EQ == 32
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Simulated negated ballot on inreg arguments: the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) is compared for equality with false via
; llvm.amdgcn.icmp, and the i32 result is tested for != 0. Returns 42 on the
; %true path and 33 on the %false path.
;
; TODO:
;   s_cmp_lt_u32 s0, 12
;   s_cselect_b32 s0, -1, 0
;   s_cmp_gt_u32 s1, 34
;   s_cselect_b32 s1, -1, 0
;   s_and_b32 s0, s0, s1
;   s_and_b32 s0, s0, exec_lo
; could be improved to:
;   s_cmp_lt_u32 s0, 12
;   s_cselect_b32 s0, -1, 0
;   s_cmp_gt_u32 s1, 34
;   s_cselect_b32 s0, s0, 0
;   s_and_b32 s0, s0, exec_lo
; By selecting into vcc(_lo) instead, we could even avoid the AND-with-exec.
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB21_3
; CHECK-NEXT: .LBB21_2: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB21_3
; CHECK-NEXT: .LBB21_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.icmp.i32(i1 %both, i1 false, i32 32) ; ICMP_EQ == 32
  %mask_nonzero = icmp ne i32 %mask, 0
  br i1 %mask_nonzero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Simulated negated ballot: the AND of two unsigned compares (%v1 < 12 and
; %v2 > 34, non-inreg arguments) is compared for equality with false via
; llvm.amdgcn.icmp, and the i32 result is tested for == 0. Returns 42 on the
; %true path and 33 on the %false path; the expected output lays out %false
; first.
define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; CHECK-NEXT: s_cbranch_vccnz .LBB22_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB22_3
; CHECK-NEXT: .LBB22_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB22_3
; CHECK-NEXT: .LBB22_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.icmp.i32(i1 %both, i1 false, i32 32) ; ICMP_EQ == 32
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Simulated negated ballot on inreg arguments: the AND of two unsigned
; compares (%v1 < 12 and %v2 > 34) is compared for equality with false via
; llvm.amdgcn.icmp, and the i32 result is tested for == 0. Returns 42 on the
; %true path and 33 on the %false path; the expected output lays out %false
; first.
define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cbranch_scc1 .LBB23_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
; CHECK-NEXT: s_branch .LBB23_3
; CHECK-NEXT: .LBB23_2: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB23_3
; CHECK-NEXT: .LBB23_3:
  %below_12 = icmp ult i32 %v1, 12
  %above_34 = icmp ugt i32 %v2, 34
  %both = and i1 %below_12, %above_34
  %mask = call i32 @llvm.amdgcn.icmp.i32(i1 %both, i1 false, i32 32) ; ICMP_EQ == 32
  %mask_is_zero = icmp eq i32 %mask, 0
  br i1 %mask_is_zero, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
|
|
|
|
; Ballot input that is neither a constant nor the direct result of a compare:
; the i1 operand is a phi merging two compares computed in different blocks
; (%tid >= 1 from %A, %tid < 2 from %B), selected by whether %cond is zero.
; Tests setting 0 to inactive lanes.
; The ballot result is stored to %out.
define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: non_cst_non_compare_input:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $sgpr0
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT: ; %bb.1: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
; GFX10-NEXT: ; implicit-def: $vgpr2
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT: ; %bb.3: ; %A
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: non_cst_non_compare_input:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_mov_b32 s1, exec_lo
; GFX11-NEXT: ; implicit-def: $sgpr0
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX11-NEXT: ; %bb.1: ; %B
; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
; GFX11-NEXT: ; implicit-def: $vgpr2
; GFX11-NEXT: ; %bb.2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1
; GFX11-NEXT: ; %bb.3: ; %A
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX11-NEXT: s_or_b32 s0, s0, s2
; GFX11-NEXT: ; %bb.4: ; %exit
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
entry:
  %cond_is_zero = icmp eq i32 %cond, 0
  br i1 %cond_is_zero, label %A, label %B

A:
  %pred_from_a = icmp uge i32 %tid, 1
  br label %exit

B:
  %pred_from_b = icmp ult i32 %tid, 2
  br label %exit

exit:
  %merged = phi i1 [ %pred_from_a, %A ], [ %pred_from_b, %B ]
  %mask = call i32 @llvm.amdgcn.ballot.i32(i1 %merged)
  store i32 %mask, ptr addrspace(1) %out
  ret void
}
|