[DAG] Remove AssertZext if the input is masked (#146052)

Remove AssertZext if the input ensures the assert cannot fail.
This commit is contained in:
Pierre van Houtryve 2025-07-29 13:05:30 +02:00 committed by GitHub
parent d1054e801c
commit 250f2a6367
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 89 additions and 81 deletions

View File

@ -15263,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
}
}
// If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
// than X, and the And doesn't change the lower iX bits, we can move the
// AssertZext in front of the And and drop the AssertSext.
if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND &&
N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext &&
isa<ConstantSDNode>(N0.getOperand(1))) {
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
const APInt &Mask = N0.getConstantOperandAPInt(1);
if (AssertVT.bitsLT(BigA_AssertVT) &&
Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
SDLoc DL(N);
SDValue NewAssert =
DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
N0.getOperand(1));
// If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
// than X, and the And doesn't change the lower iX bits, we can move the
// AssertZext in front of the And and drop the AssertSext.
if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) {
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
if (AssertVT.bitsLT(BigA_AssertVT) &&
Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
SDLoc DL(N);
SDValue NewAssert =
DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
N0.getOperand(1));
}
}
// Remove AssertZext entirely if the mask guarantees the assertion cannot
// fail.
// TODO: Use KB countMinLeadingZeros to handle non-constant masks?
if (Mask.isIntN(AssertVT.getScalarSizeInBits()))
return N0;
}
return SDValue();

View File

@ -442,9 +442,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:456

View File

@ -714,10 +714,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s0, s0, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@ -732,9 +732,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX12-LABEL: store_load_vindex_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@ -769,8 +769,8 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX942-LABEL: store_load_vindex_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 15
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s0, s0, 7
@ -809,10 +809,10 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX11-PAL-LABEL: store_load_vindex_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7
; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@ -827,9 +827,9 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX12-PAL-LABEL: store_load_vindex_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@ -1958,10 +1958,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@ -1976,10 +1976,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX12-LABEL: store_load_vindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@ -2021,8 +2021,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 15
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s0, s0, 7
@ -2092,10 +2092,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
@ -2110,10 +2110,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
@ -3254,10 +3254,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX11-LABEL: store_load_vindex_large_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s0, s0, 7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@ -3274,10 +3274,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX12-LABEL: store_load_vindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@ -3319,8 +3319,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 15
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s0, s0, 7
@ -3391,10 +3391,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7
; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@ -3411,10 +3411,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX12-PAL: ; %bb.0: ; %bb
; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0

View File

@ -15,8 +15,8 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
; GCN-LABEL: test_iglp_opt_mfma_gemm:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -153,8 +153,8 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -289,8 +289,8 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias
; GCN-LABEL: test_iglp_opt_asm_sideeffect:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0

View File

@ -6,9 +6,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0
; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40
; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
@ -74,9 +74,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
@ -178,9 +178,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16
; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16
@ -260,9 +260,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16

View File

@ -8,10 +8,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0
; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
@ -60,10 +60,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0
; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28

View File

@ -7,8 +7,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN-MINREG: ; %bb.0: ; %entry
; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
@ -140,8 +140,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0
; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
@ -274,8 +274,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN-ILP: ; %bb.0: ; %entry
; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
@ -469,8 +469,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region:
; GCN-MINREG: ; %bb.0: ; %entry
; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v2, 7, v0
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0
; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
@ -604,8 +604,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region:
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0
; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
@ -739,8 +739,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region:
; GCN-ILP: ; %bb.0: ; %entry
; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0
; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0
; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)

View File

@ -621,8 +621,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
@ -728,8 +728,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
@ -871,8 +871,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@ -1005,8 +1005,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
@ -1202,7 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, s0, v3
; GCN-NEXT: v_rndne_f32_e32 v5, v4
@ -1212,7 +1212,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_add_f32_e32 v4, v6, v4
; GCN-NEXT: v_exp_f32_e32 v4, v4
; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_add_u32_e32 v1, s6, v0
; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112
; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96
@ -1387,7 +1387,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3
; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4
@ -1397,7 +1397,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4
; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4
; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0
; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112
; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96

View File

@ -208,11 +208,11 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
;
; GFX11-LABEL: dpp_test1:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: ds_load_b32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_barrier

View File

@ -146,8 +146,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-LABEL: mubuf_clause:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
@ -205,8 +205,8 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-SCRATCH-LABEL: mubuf_clause:
; GCN-SCRATCH: ; %bb.0: ; %bb
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31
; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2
; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18
; GCN-SCRATCH-NEXT: s_clause 0x3
; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off