AMDGPU/GlobalISel: RegBankLegalize rules for wqm_demote (#188288)

This commit is contained in:
vangthao95 2026-03-25 09:10:28 -07:00 committed by GitHub
parent 6e916d0598
commit 69f9ff6c19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 122 additions and 113 deletions

View File

@ -1660,6 +1660,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
.Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
// wqm_demote is used without a result and with a single condition operand, so
// this rule has an empty destination-mapping list and maps the sources as
// {IntrId, Vcc}, i.e. the condition is requested in the Vcc bank.
// NOTE(review): the empty predicate list presumably makes the rule match for
// any operand uniformity -- confirm against the RegBankLegalizeRules matcher.
addRulesForIOpcs({amdgcn_wqm_demote}).Any({{}, {{}, {IntrId, Vcc}}});
// live_mask / ps_live: matched with a DivS1 predicate; the result is mapped to
// Vcc and the source-mapping list is left empty.
// NOTE(review): DivS1 presumably denotes a divergent s1 result -- confirm.
addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
.Any({{DivS1}, {{Vcc}, {}}});

View File

@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-32 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10-64 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-32 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
@ -159,11 +159,12 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
; SI-NEXT: s_mov_b64 s[2:3], exec
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: s_xor_b64 s[2:3], vcc, -1
; SI-NEXT: s_xor_b64 s[2:3], vcc, s[2:3]
; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB2_3
@ -186,11 +187,12 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1
; GFX9-NEXT: s_xor_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB2_3
@ -213,11 +215,12 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1
; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB2_3
@ -240,11 +243,12 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX10-64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1
; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz .LBB2_3
@ -674,18 +678,19 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
; SI-NEXT: .LBB6_3: ; %.continue0
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: s_nop 1
; SI-NEXT: s_mov_b64 s[2:3], exec
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB6_6
@ -722,18 +727,19 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB6_3: ; %.continue0
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB6_6
@ -770,16 +776,17 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: .LBB6_3: ; %.continue0
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_mov_b32 s1, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1
; GFX10-32-NEXT: s_mov_b32 s2, s0
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s2
; GFX10-32-NEXT: v_mov_b32_e32 v1, v0
; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s1, s1, -1
; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s1, s2, s1
; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB6_6
@ -816,16 +823,17 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: .LBB6_3: ; %.continue0
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
; GFX10-64-NEXT: v_mov_b32_e32 v1, v0
; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz .LBB6_6
@ -885,7 +893,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@ -894,36 +902,37 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_branch .LBB7_5
; SI-NEXT: .LBB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; SI-NEXT: s_add_i32 s6, s6, 1
; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_8
; SI-NEXT: .LBB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b64 s[8:9], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v2, v0
; SI-NEXT: s_mov_b64 s[4:5], exec
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 s[8:9], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; SI-NEXT: s_cbranch_execz .LBB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
@ -931,8 +940,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
@ -951,7 +960,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@ -960,36 +969,37 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_branch .LBB7_5
; GFX9-NEXT: .LBB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX9-NEXT: s_add_i32 s6, s6, 1
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_8
; GFX9-NEXT: .LBB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v3, v2
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_b64 s[8:9], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX9-NEXT: s_cbranch_execz .LBB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
@ -997,8 +1007,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
@ -1030,29 +1040,30 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_add_i32 s2, s2, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s2, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, s2, -1
; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3
; GFX10-32-NEXT: s_mov_b32 s4, s0
; GFX10-32-NEXT: s_mov_b32 s3, exec_lo
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s4
; GFX10-32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_b32 s4, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s3, s4, s3
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
@ -1060,8 +1071,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_wqm_b32 s4, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
@ -1080,7 +1091,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s4, 0
; GFX10-64-NEXT: s_mov_b32 s6, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@ -1089,34 +1100,34 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
; GFX10-64-NEXT: s_branch .LBB7_5
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX10-64-NEXT: s_add_i32 s6, s6, 1
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
; GFX10-64-NEXT: .LBB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX10-64-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX10-64-NEXT: s_mov_b64 s[4:5], exec
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[8:9]
; GFX10-64-NEXT: v_mov_b32_e32 v2, v0
; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_b64 s[8:9], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
@ -1124,8 +1135,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT: s_branch .LBB7_4
; GFX10-64-NEXT: .LBB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]

View File

@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: wqm_demote_scc
@ -15,9 +14,8 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1)
; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[ICMP]](s32)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[AMDGPU_COPY_VCC_SCC]](s1)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
@ -52,9 +50,8 @@ body: |
bb.0:
; CHECK-LABEL: name: wqm_demote_constant_true
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[C]](s32)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[AMDGPU_COPY_VCC_SCC]](s1)
%0:_(s1) = G_CONSTANT i1 true
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
...
@ -67,9 +64,8 @@ body: |
bb.0:
; CHECK-LABEL: name: wqm_demote_constant_false
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[C]](s32)
; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[AMDGPU_COPY_VCC_SCC]](s1)
%0:_(s1) = G_CONSTANT i1 false
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
...