From a44c15874dbd22b4eeb99b5bbf604b569c41f63f Mon Sep 17 00:00:00 2001
From: Lakreite
Date: Fri, 3 Apr 2026 15:30:47 +0300
Subject: [PATCH] [AMDGPU][CodeGen] Implement SimplifyDemandedBitsForTargetNode for readfirstlane. (#190009)

Propagate demanded bits through readfirstlane intrinsic in
AMDGPUISelLowering with SimplifyDemandedBitsForTargetNode
implementation. This allows upstream zero/sign extensions to be
eliminated when only a subset of bits is used after the intrinsic.
All other target nodes fall back to the generic TargetLowering
implementation so their known bits are still computed.

Partially addresses #128390.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  30 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   8 +
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |   7 +-
 ...pu-simplify-demanded-bits-readfirstlane.ll |  60 ++++
 .../atomic_optimizations_global_pointer.ll    | 276 +++++++-----------
 .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll     |  18 +-
 6 files changed, 208 insertions(+), 191 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e7c5b2cb5cc4..dfff3216e0cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5806,6 +5806,36 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
+// Target hook for SimplifyDemandedBits on AMDGPU nodes and intrinsics. Returns
+// true if a simplification was recorded in TLO.
+bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (Op.getConstantOperandVal(0)) {
+    case Intrinsic::amdgcn_readfirstlane:
+      // The result carries the operand's bits unchanged, so whatever is
+      // demanded of the result is demanded of the operand. If nothing was
+      // simplified, Known describes the operand and therefore the result too.
+      return SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
+                                  OriginalDemandedElts, Known, TLO, Depth + 1);
+    default:
+      break;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Defer to the generic hook for everything else: it fills in Known via
+  // computeKnownBitsForTargetNode, which a plain "return false" would skip.
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
+
//===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 18fed2ebe6e6..5c0828bf4d9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -277,6 +277,14 @@ public: SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + bool SimplifyDemandedBitsForTargetNode(SDValue Op, + const APInt &OriginalDemandedBits, + const APInt &OriginalDemandedElts, + KnownBits &Known, + TargetLoweringOpt &TLO, + unsigned Depth) const override; + void ReplaceNodeResults(SDNode * N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 689b306518c9..44dbc0442b68 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -10,13 +10,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s0, s0, s4 -; GCN-NEXT: s_addc_u32 s1, s1, s5 -; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: s_load_dword s4, s[0:1], s4 ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll new file mode 100644 index 000000000000..05832e238774 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-demanded-bits-readfirstlane.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +define void @readfirstlane_demanded_i8_zext_store(i8 %src, ptr addrspace(1) %ptr) { +; GCN-LABEL: readfirstlane_demanded_i8_zext_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_store_byte v[1:2], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %zext = zext i8 %src to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext) + %trunc = trunc i32 %readfirstlane to i8 + store i8 %trunc, ptr addrspace(1) %ptr + ret void +} + +define void @readfirstlane_demanded_i8_sext_store(i8 %src, ptr addrspace(1) %ptr) { +; GCN-LABEL: readfirstlane_demanded_i8_sext_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_store_byte v[1:2], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %sext = sext i8 %src to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sext) + %trunc = trunc i32 %readfirstlane to i8 + store i8 %trunc, ptr addrspace(1) %ptr + ret void +} + +define i16 @readfirstlane_demanded_i16_zext(i16 %src) { +; GCN-LABEL: readfirstlane_demanded_i16_zext: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] + %zext = zext i16 %src to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %zext) 
+ %trunc = trunc i32 %readfirstlane to i16 + ret i16 %trunc +} + +define i16 @readfirstlane_demanded_i16_sext(i16 %src) { +; GCN-LABEL: readfirstlane_demanded_i16_sext: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] + %sext = sext i16 %src to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sext) + %trunc = trunc i32 %readfirstlane to i16 + ret i16 %trunc +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index fbc8b812d96c..dfd56b091748 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -8159,10 +8159,9 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s12 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s12 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX7LESS-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7LESS-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX7LESS-NEXT: s_endpgm @@ -8208,7 +8207,6 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX8-NEXT: .LBB12_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s12 @@ -8260,7 +8258,6 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX9-NEXT: .LBB12_4: ; %Flow ; 
GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -8313,12 +8310,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1064-NEXT: .LBB12_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1064-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1064-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX1064-NEXT: s_endpgm @@ -8364,12 +8360,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1032-NEXT: .LBB12_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1032-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1032-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm @@ -8419,14 +8414,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1164-TRUE16-NEXT: .LBB12_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1164-TRUE16-NEXT: s_endpgm @@ -8476,14 +8469,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1164-FAKE16-NEXT: .LBB12_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1164-FAKE16-NEXT: s_endpgm @@ -8531,14 +8522,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1132-TRUE16-NEXT: .LBB12_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1132-TRUE16-NEXT: s_endpgm @@ -8586,14 +8575,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1132-FAKE16-NEXT: .LBB12_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm @@ -8643,13 +8630,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1264-TRUE16-NEXT: .LBB12_4: ; %Flow ; 
GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l @@ -8701,13 +8687,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1264-FAKE16-NEXT: .LBB12_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 @@ -8757,13 +8742,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1232-TRUE16-NEXT: .LBB12_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1232-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l @@ -8813,13 +8797,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1232-FAKE16-NEXT: .LBB12_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 @@ -8884,7 +8867,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff ; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, s4 @@ -8941,7 +8923,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX8-NEXT: .LBB13_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -8999,7 +8980,6 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX9-NEXT: .LBB13_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9058,10 +9038,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1064-NEXT: .LBB13_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -9115,10 +9094,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1032-NEXT: .LBB13_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -9178,12 +9156,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; 
GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -9243,12 +9219,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -9305,12 +9279,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; 
GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -9367,12 +9339,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -9432,13 +9402,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -9498,13 +9466,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -9564,13 +9530,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-TRUE16-NEXT: s_wait_alu 
depctr_va_sdst(0) ; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -9630,13 +9594,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -10064,10 +10026,9 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s12 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s12 +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX7LESS-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX7LESS-NEXT: s_endpgm @@ -10113,7 +10074,6 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX8-NEXT: .LBB15_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s12 @@ -10165,7 +10125,6 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX9-NEXT: .LBB15_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -10218,12 +10177,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1064-NEXT: .LBB15_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1064-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1064-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX1064-NEXT: s_endpgm @@ -10269,12 +10227,11 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1032-NEXT: .LBB15_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1032-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1032-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm @@ -10324,13 +10281,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; 
GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1164-TRUE16-NEXT: .LBB15_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1164-TRUE16-NEXT: s_endpgm @@ -10380,14 +10336,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1164-FAKE16-NEXT: .LBB15_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1164-FAKE16-NEXT: s_endpgm @@ -10435,13 +10389,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr 
addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1132-TRUE16-NEXT: .LBB15_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1132-TRUE16-NEXT: s_endpgm @@ -10489,14 +10442,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1132-FAKE16-NEXT: .LBB15_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm @@ -10546,14 +10497,14 @@ define 
amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1264-TRUE16-NEXT: .LBB15_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1264-TRUE16-NEXT: s_endpgm @@ -10603,13 +10554,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 ; GFX1264-FAKE16-NEXT: .LBB15_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 @@ -10659,14 +10609,14 @@ 
define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1232-TRUE16-NEXT: .LBB15_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1232-TRUE16-NEXT: s_endpgm @@ -10714,13 +10664,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX1232-FAKE16-NEXT: .LBB15_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 @@ 
-10785,7 +10734,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xffff ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 @@ -10842,7 +10790,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX8-NEXT: .LBB16_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -10900,7 +10847,6 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX9-NEXT: .LBB16_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10959,10 +10905,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1064-NEXT: .LBB16_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -11016,10 +10961,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1032-NEXT: .LBB16_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -11079,11 +11023,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11143,12 +11086,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11205,11 +11146,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) 
%result, ptr addrspa ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11266,12 +11206,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11331,11 +11269,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 @@ -11396,13 +11333,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11462,11 +11397,10 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) ; 
GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 @@ -11527,13 +11461,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll index 0290ffdf161c..888da732520e 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll @@ -4,24 +4,14 @@ define amdgpu_gs i32 @main() { ; CHECK-LABEL: main: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_bitcmp1_b32 0, 0 ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_cselect_b32 s1, -1, 0 -; CHECK-NEXT: s_or_saveexec_b32 s2, -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s1, v0 -; CHECK-NEXT: s_mov_b32 exec_lo, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) 
; CHECK-NEXT: s_xor_b32 s0, s0, -1 -; CHECK-NEXT: s_wait_alu depctr_sa_sdst(0) -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_wait_alu depctr_va_sdst(0) ; CHECK-NEXT: ; return to shader part epilog bb: