llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
Shilei Tian fc0653f31c
[RFC][NFC][AMDGPU] Remove -verify-machineinstrs from llvm/test/CodeGen/AMDGPU/*.ll (#150024)
Recent upstream trends have moved away from explicitly using `-verify-machineinstrs`, as it's already covered by the expensive checks. This PR removes almost all `-verify-machineinstrs` from tests in `llvm/test/CodeGen/AMDGPU/*.ll`, leaving only those tests where its removal currently causes failures.
2025-07-23 13:42:46 -04:00

616 lines
23 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s
; Simplest case, if - then, that requires lane mask merging.
; The %phi lane mask will hold %val_A at %A. Lanes that are active in %B
; will overwrite their own lane bit in the lane mask with %val_B.
; Shape of the IR below: %A computes %val_A and branches on %cond == 0, so only
; lanes with %cond == 0 run %B, which computes %val_B. %exit merges both i1
; values with a phi and stores 1 (phi true) or 2 (phi false) to %out.
define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: divergent_i1_phi_if_then:
; GFX10: ; %bb.0: ; %A
; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: ; %bb.1: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
A:
; Phi input on the edge that goes straight from %A to %exit: tid >= 6 (unsigned).
%val_A = icmp uge i32 %tid, 6
; Lanes with cond == 0 go through %B; all other lanes jump directly to %exit.
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %B, label %exit
B:
; Phi input on the edge from %B: tid < 1 (unsigned).
%val_B = icmp ult i32 %tid, 1
br label %exit
exit:
; i1 phi whose value depends on which predecessor the lane came from.
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
; Turn the i1 into an observable i32 (1 when true, 2 when false) and store it.
%sel = select i1 %phi, i32 1, i32 2
store i32 %sel, ptr addrspace(1) %out
ret void
}
; if - else
; Shape of the IR below: %entry branches on %cond == 0 to either %A or %B. Each
; side computes its own i1 value, and %exit merges them with a phi before
; storing 1 (phi true) or 2 (phi false) to %out.
define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) {
; GFX10-LABEL: divergent_i1_phi_if_else:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_and_b32 s0, s0, 1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT: ; %bb.1: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: ; implicit-def: $vgpr2
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT: ; %bb.3: ; %A
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
entry:
; Lanes with cond == 0 take %A, all other lanes take %B.
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %A, label %B
A:
; Phi input on the edge from %A: tid >= 1 (unsigned).
%val_A = icmp uge i32 %tid, 1
br label %exit
B:
; Phi input on the edge from %B: tid < 2 (unsigned).
%val_B = icmp ult i32 %tid, 2
br label %exit
exit:
; i1 phi with one incoming value per side of the if - else.
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
; Turn the i1 into an observable i32 (1 when true, 2 when false) and store it.
%sel = select i1 %phi, i32 1, i32 2
store i32 %sel, ptr addrspace(1) %out
ret void
}
; if - break;
; counter = 0;
; do {
; if (a[counter] == 0)
; break;
; if (b[counter] == 0)
; break;
; if (c[counter] == 0)
; break;
; x[counter++]+=1;
; } while (counter<100);
; Tests with multiple break conditions. Divergent phis will be used to track
; if any of the break conditions was reached. We only need to do simple lane
; mask merging (for current loop iteration only). There is an intrinsic,
; if_break, that will merge lane masks across all iterations of the loop.
; Loop with one break condition in the header (%A) in addition to the exit
; branch at the end of %loop.body. %counter starts at 0 and is incremented by 1
; in %loop.body; it indexes both %a and %x as arrays of i32.
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB2_2
; GFX10-NEXT: .LBB2_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s5
; GFX10-NEXT: s_or_b32 s4, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB2_4
; GFX10-NEXT: .LBB2_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo
; GFX10-NEXT: s_or_b32 s5, s1, s5
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v5, vcc_lo
; GFX10-NEXT: global_load_dword v4, v[4:5], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB2_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64
; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: s_andn2_b32 s3, s5, exec_lo
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_or_b32 s5, s3, s0
; GFX10-NEXT: global_load_dword v6, v[4:5], off
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6
; GFX10-NEXT: global_store_dword v[4:5], v6, off
; GFX10-NEXT: s_branch .LBB2_1
; GFX10-NEXT: .LBB2_4: ; %exit
; GFX10-NEXT: s_endpgm
entry:
br label %A
A:
; Loop header: %counter is 0 on entry and %counter.plus.1 on the back edge.
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
; Break condition: leave the loop when a[counter] == 0.
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %exit, label %loop.body
loop.body:
; x[counter] += 1
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
; Note: the compare uses the pre-increment %counter, and the branch goes to
; %exit when %counter < 100 (unsigned) and back to %A otherwise. The generated
; assertions above match this IR as written.
%x.cond = icmp ult i32 %counter, 100
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
; Loop with two chained break conditions (%A checks %a, %B checks %b) in
; addition to the exit branch at the end of %loop.body. %counter starts at 0
; and is incremented by 1 in %loop.body; it indexes %a, %b and %x as arrays of
; i32.
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX10-LABEL: loop_with_2breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_1: ; %Flow3
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT: s_andn2_b32 s2, s5, exec_lo
; GFX10-NEXT: s_and_b32 s3, exec_lo, s6
; GFX10-NEXT: s_or_b32 s5, s2, s3
; GFX10-NEXT: .LBB3_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s5
; GFX10-NEXT: s_or_b32 s4, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_3: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo
; GFX10-NEXT: s_or_b32 s5, s1, s5
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_2
; GFX10-NEXT: ; %bb.4: ; %B
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo
; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_1
; GFX10-NEXT: ; %bb.5: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64
; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_or_b32 s6, s3, s0
; GFX10-NEXT: global_load_dword v8, v[6:7], off
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
; GFX10-NEXT: global_store_dword v[6:7], v8, off
; GFX10-NEXT: s_branch .LBB3_1
; GFX10-NEXT: .LBB3_6: ; %exit
; GFX10-NEXT: s_endpgm
entry:
br label %A
A:
; Loop header: %counter is 0 on entry and %counter.plus.1 on the back edge.
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
; First break condition: leave the loop when a[counter] == 0.
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %exit, label %B
B:
; Second break condition: leave the loop when b[counter] == 0.
%b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
%b.val = load i32, ptr addrspace(1) %b.plus.counter
%b.cond = icmp eq i32 %b.val, 0
br i1 %b.cond, label %exit, label %loop.body
loop.body:
; x[counter] += 1
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
; Note: the compare uses the pre-increment %counter, and the branch goes to
; %exit when %counter < 100 (unsigned) and back to %A otherwise.
%x.cond = icmp ult i32 %counter, 100
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
; Loop with three chained break conditions (%A checks %a, %B checks %b, %C
; checks %c) in addition to the exit branch at the end of %loop.body. %counter
; starts at 0 and is incremented by 1 in %loop.body; it indexes %a, %b, %c and
; %x as arrays of i32.
define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
; GFX10-LABEL: loop_with_3breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB4_4
; GFX10-NEXT: .LBB4_1: ; %Flow5
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_andn2_b32 s2, s6, exec_lo
; GFX10-NEXT: s_and_b32 s3, exec_lo, s8
; GFX10-NEXT: s_or_b32 s6, s2, s3
; GFX10-NEXT: .LBB4_2: ; %Flow4
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT: s_andn2_b32 s2, s5, exec_lo
; GFX10-NEXT: s_and_b32 s3, exec_lo, s6
; GFX10-NEXT: s_or_b32 s5, s2, s3
; GFX10-NEXT: .LBB4_3: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s5
; GFX10-NEXT: s_or_b32 s4, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB4_8
; GFX10-NEXT: .LBB4_4: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v9, s3
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo
; GFX10-NEXT: s_or_b32 s5, s1, s5
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
; GFX10-NEXT: global_load_dword v8, v[8:9], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_3
; GFX10-NEXT: ; %bb.5: ; %B
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, s3
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v9, vcc_lo
; GFX10-NEXT: global_load_dword v8, v[8:9], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_2
; GFX10-NEXT: ; %bb.6: ; %C
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, s3
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v6, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v7, v9, vcc_lo
; GFX10-NEXT: global_load_dword v8, v[8:9], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_1
; GFX10-NEXT: ; %bb.7: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, s3
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64
; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: s_andn2_b32 s3, s8, exec_lo
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_or_b32 s8, s3, s0
; GFX10-NEXT: global_load_dword v10, v[8:9], off
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v10
; GFX10-NEXT: global_store_dword v[8:9], v10, off
; GFX10-NEXT: s_branch .LBB4_1
; GFX10-NEXT: .LBB4_8: ; %exit
; GFX10-NEXT: s_endpgm
entry:
br label %A
A:
; Loop header: %counter is 0 on entry and %counter.plus.1 on the back edge.
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
; First break condition: leave the loop when a[counter] == 0.
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %exit, label %B
B:
; Second break condition: leave the loop when b[counter] == 0.
%b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
%b.val = load i32, ptr addrspace(1) %b.plus.counter
%b.cond = icmp eq i32 %b.val, 0
br i1 %b.cond, label %exit, label %C
C:
; Third break condition: leave the loop when c[counter] == 0.
%c.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %c, i32 %counter
%c.val = load i32, ptr addrspace(1) %c.plus.counter
%c.cond = icmp eq i32 %c.val, 0
br i1 %c.cond, label %exit, label %loop.body
loop.body:
; x[counter] += 1
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
; Note: the compare uses the pre-increment %counter, and the branch goes to
; %exit when %counter < 100 (unsigned) and back to %A otherwise.
%x.cond = icmp ult i32 %counter, 100
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
; Divergent condition if with body, ending with break. This is loop with two
; exits but structurizer will create phi that will track exit from break
; and move break.body after the loop. Loop will then have one exit and phi
; used outside of the loop by condition used to enter the break.body.
; Loop with two exits: the header %A leaves through %break.body (which stores
; 10 to %a.break before reaching %exit), and %loop.body leaves directly to
; %exit. In the checked assembly the %break.body code is placed after the loop,
; behind a %loop.exit.guard block.
define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_div_break_with_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_and_b32 s1, exec_lo, s6
; GFX10-NEXT: s_or_b32 s4, s1, s4
; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo
; GFX10-NEXT: s_and_b32 s2, exec_lo, s7
; GFX10-NEXT: s_or_b32 s5, s1, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB5_4
; GFX10-NEXT: .LBB5_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_and_b32 s7, exec_lo, s8
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo
; GFX10-NEXT: s_or_b32 s7, s1, s7
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB5_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_add_i32 s2, s0, 1
; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64
; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; GFX10-NEXT: s_andn2_b32 s3, s7, exec_lo
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT: global_load_dword v8, v[6:7], off
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_or_b32 s7, s3, s7
; GFX10-NEXT: s_or_b32 s6, s6, s0
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
; GFX10-NEXT: global_store_dword v[6:7], v8, off
; GFX10-NEXT: s_branch .LBB5_1
; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s0, s5
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB5_6
; GFX10-NEXT: ; %bb.5: ; %break.body
; GFX10-NEXT: v_mov_b32_e32 v0, 10
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: .LBB5_6: ; %exit
; GFX10-NEXT: s_endpgm
entry:
br label %A
A:
; Loop header: %counter is 0 on entry and %counter.plus.1 on the back edge.
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
; Break with a body: when a[counter] == 0, leave the loop through %break.body.
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %break.body, label %loop.body
break.body:
; Only reached from %A; records the break by storing 10 to %a.break.
store i32 10, ptr addrspace(1) %a.break
br label %exit
loop.body:
; x[counter] += 1
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
; Note: the compare uses the pre-increment %counter, and the branch goes to
; %exit (skipping %break.body) when %counter < 100 (unsigned) and back to %A
; otherwise.
%x.cond = icmp ult i32 %counter, 100
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
; Snippet from test generated by the GraphicsFuzz tool, frontend generates ir
; with irreducible control flow graph.
; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
; do {
; if (y < a2) {
; do {
; } while (x < a2);
; }
; if (x < a3) {
; return a1;
; }
; } while (y < a2);
; return a0;
; }
; This test is also interesting because it used to have a phi with three
; incoming values. After fa4cc9ddd58eb9fef2497e678873ff3b495340a3,
; FixIrreducible no longer generates such a phi. There is a MIR test that
; covers a phi with three incoming values.
; The cycle formed by %.preheader, %.inner_loop and %.loopexit can be entered
; from %.entry at two different blocks (%.preheader or %.loopexit), which is
; what makes this control flow graph irreducible. The checked assembly shows an
; %irr.guard block acting as the single loop header. Returns %a1 when
; 'x < a3' and %a0 otherwise.
define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; GFX10-LABEL: irreducible_cfg:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, v4, v1
; GFX10-NEXT: s_mov_b32 s0, exec_lo
; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: s_and_b32 s2, s0, 1
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: s_cselect_b32 s3, exec_lo, 0
; GFX10-NEXT: s_branch .LBB6_2
; GFX10-NEXT: .LBB6_1: ; %Flow2
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX10-NEXT: s_and_b32 s4, exec_lo, s5
; GFX10-NEXT: s_mov_b32 s0, exec_lo
; GFX10-NEXT: s_or_b32 s1, s4, s1
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_cbranch_execz .LBB6_8
; GFX10-NEXT: .LBB6_2: ; %irr.guard
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB6_6 Depth 2
; GFX10-NEXT: s_mov_b32 s4, exec_lo
; GFX10-NEXT: s_and_saveexec_b32 s5, s0
; GFX10-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX10-NEXT: ; %bb.3: ; %.loopexit
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: v_cmp_gt_i32_e64 s0, v5, v0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_mov_b32 s7, exec_lo
; GFX10-NEXT: s_xor_b32 s6, vcc_lo, s6
; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT: s_or_b32 s6, s0, s6
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_xor_b32 s6, s6, s7
; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
; GFX10-NEXT: s_or_b32 s3, s3, s0
; GFX10-NEXT: s_or_b32 s4, s4, s6
; GFX10-NEXT: ; %bb.4: ; %Flow1
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_andn2_b32 s0, s2, exec_lo
; GFX10-NEXT: s_and_b32 s2, exec_lo, s3
; GFX10-NEXT: s_mov_b32 s5, exec_lo
; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_and_saveexec_b32 s6, s4
; GFX10-NEXT: s_cbranch_execz .LBB6_1
; GFX10-NEXT: ; %bb.5: ; %.preheader
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: v_cmp_le_i32_e64 s0, v4, v0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB6_6: ; %.inner_loop
; GFX10-NEXT: ; Parent Loop BB6_2 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: s_and_b32 s7, exec_lo, s0
; GFX10-NEXT: s_or_b32 s4, s7, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB6_6
; GFX10-NEXT: ; %bb.7: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_andn2_b32 s0, s5, exec_lo
; GFX10-NEXT: s_and_b32 s4, exec_lo, 0
; GFX10-NEXT: s_or_b32 s5, s0, s4
; GFX10-NEXT: s_branch .LBB6_1
; GFX10-NEXT: .LBB6_8: ; %.exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s2
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
.entry:
; All three conditions are signed compares computed once, before any loop.
%.y_lt_a2 = icmp sgt i32 %a2, %y
%.x_lt_a2 = icmp sgt i32 %a2, %x
%.x_lt_a3 = icmp sgt i32 %a3, %x
br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
.preheader: ; if (y < a2)
br label %.inner_loop
.inner_loop: ; do while x < a2
; Empty self-loop: the block only branches back to itself while 'x < a2'.
br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
.loopexit: ; if x < a3
%not.inner_loop = xor i1 %.y_lt_a2, true
%brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
%.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0 ; select return value: a1 if 'x < a3', or a0 if 'loop ends'
br i1 %brmerge, label %.exit, label %.preheader
.exit:
ret i32 %.ret
}