Revert "Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)"" (#108341)

Reverts llvm/llvm-project#108173

si-init-whole-wave.mir crashes on some buildbots, even though it passed
both locally (with sanitizers enabled) and in pre-merge tests.
Reverting while the failure is investigated.
This commit is contained in:
Diana Picus 2024-09-12 10:12:09 +02:00 committed by GitHub
parent 42494e5175
commit 7792b4ae79
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 5 additions and 1528 deletions

View File

@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
// Sets the function into whole-wave-mode and returns whether the lane was
// active when entering the function. A branch depending on this return will
// revert the EXEC mask to what it was when entering the function, thus
// resulting in a no-op. This pattern is used to optimize branches when function
// tails need to be run in whole-wave-mode. It may also have other consequences
// (mostly related to WWM CSR handling) that differentiate it from using
// a plain `amdgcn.init.exec -1`.
//
// The intrinsic takes no operands and produces a single i1 result. It is
// declared convergent and as having side effects, but as not accessing memory.
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
// Exposed to Clang as __builtin_amdgcn_wavefrontsize. Takes no operands and
// produces an i32 result that is marked NoUndef; the intrinsic is declared as
// not accessing memory and as speculatable.
def int_amdgcn_wavefrontsize :
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

View File

@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
case Intrinsic::amdgcn_init_whole_wave:
CurDAG->getMachineFunction()
.getInfo<SIMachineFunctionInfo>()
->setInitWholeWave();
break;
}
SelectCode(N);

View File

@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
/// Select an llvm.amdgcn.init.whole.wave intrinsic instruction.
///
/// Flags the function that contains \p MI as using init.whole.wave on its
/// SIMachineFunctionInfo, then hands the instruction to selectImpl.
///
/// \param MI the intrinsic instruction being selected.
/// \returns the result of selectImpl for \p MI.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  // Walk instruction -> basic block -> function to reach the per-function
  // info, and record the flag before selection proceeds.
  MI.getParent()
      ->getParent()
      ->getInfo<SIMachineFunctionInfo>()
      ->setInitWholeWave();
  return selectImpl(MI, *CoverageInfo);
}
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:

View File

@ -120,7 +120,6 @@ private:
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

View File

@ -67,8 +67,6 @@ protected:
// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;
bool HasInitWholeWave = false;
public:
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
@ -111,9 +109,6 @@ public:
return WaveLimiter;
}
/// Returns the current value of the HasInitWholeWave flag.
bool hasInitWholeWave() const {
  return HasInitWholeWave;
}
/// Sets the HasInitWholeWave flag to true. There is no way to clear it again
/// through this function.
void setInitWholeWave() {
  HasInitWholeWave = true;
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
return allocateLDSGlobal(DL, GV, DynLDSAlign);
}

View File

@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
case Intrinsic::amdgcn_init_whole_wave:
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;

View File

@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;

View File

@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
if (YamlMFI.HasInitWholeWave)
MFI->setInitWholeWave();
return false;
}

View File

@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// Allocate spill slots for WWM reserved VGPRs.
// For chain functions, we only need to do this if we have calls to
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
// chain functions do not return) and the function did not contain a call to
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
// when entering the function).
bool IsChainWithoutRestores =
FuncInfo->isChainFunction() &&
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
// llvm.amdgcn.cs.chain.
bool IsChainWithoutCalls =
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

View File

@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
let Defs = [EXEC];
}
// Sets EXEC to all lanes and returns the previous EXEC.
// The selection pattern maps the llvm.amdgcn.init.whole.wave intrinsic onto
// this pseudo, with the i1 result placed in $dst. EXEC appears in both Defs
// and Uses because the incoming mask is read and then overwritten. The pseudo
// is marked convergent, matching the intrinsic.
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
(outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
let Defs = [EXEC];
let Uses = [EXEC];
let isConvergent = 1;
}
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

View File

@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue SGPRForEXECCopy;
StringValue LongBranchReservedReg;
bool HasInitWholeWave = false;
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue()); // Don't print out when it's empty.
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
}
};

View File

@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
bool IsWave32 = ST->isWave32();
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
assert(MBB == &MBB->getParent()->front() &&
"init whole wave not in entry block");
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *SaveExec =
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64),
EntryExec)
.addImm(-1);
// Replace all uses of MI's destination reg with EntryExec.
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
}
MI.eraseFromParent();
if (LIS) {
LIS->InsertMachineInstrInMaps(*SaveExec);
LIS->createAndComputeVirtRegInterval(EntryExec);
}
return;
}
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI =

File diff suppressed because it is too large Load Diff

View File

@ -1,140 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
; This shouldn't be too different from wave32, so we'll only test one case.
define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 inreg %exec, { i32, ptr addrspace(5), i32, i64 } %vgpr, i32 %x, i32 %y) {
; GISEL12-LABEL: basic:
; GISEL12: ; %bb.0: ; %entry
; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT: s_wait_expcnt 0x0
; GISEL12-NEXT: s_wait_samplecnt 0x0
; GISEL12-NEXT: s_wait_bvhcnt 0x0
; GISEL12-NEXT: s_wait_kmcnt 0x0
; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; GISEL12-NEXT: s_mov_b32 s8, s3
; GISEL12-NEXT: s_mov_b32 s9, s4
; GISEL12-NEXT: s_mov_b32 s4, s5
; GISEL12-NEXT: s_mov_b32 s5, s6
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_and_saveexec_b64 s[6:7], s[10:11]
; GISEL12-NEXT: ; %bb.1: ; %shader
; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; GISEL12-NEXT: v_mov_b32_e32 v0, s12
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GISEL12-NEXT: v_mov_b32_e32 v1, s13
; GISEL12-NEXT: s_mov_b64 exec, s[10:11]
; GISEL12-NEXT: v_mov_b32_e32 v11, v0
; GISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GISEL12-NEXT: v_mov_b32_e32 v12, v1
; GISEL12-NEXT: ; %bb.2: ; %tail
; GISEL12-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL12-NEXT: s_mov_b64 exec, s[4:5]
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_setpc_b64 s[8:9]
;
; DAGISEL12-LABEL: basic:
; DAGISEL12: ; %bb.0: ; %entry
; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT: s_wait_expcnt 0x0
; DAGISEL12-NEXT: s_wait_samplecnt 0x0
; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL12-NEXT: s_wait_kmcnt 0x0
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT: s_mov_b32 s7, s6
; DAGISEL12-NEXT: s_mov_b32 s6, s5
; DAGISEL12-NEXT: s_mov_b32 s5, s4
; DAGISEL12-NEXT: s_mov_b32 s4, s3
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; DAGISEL12-NEXT: ; %bb.1: ; %shader
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11]
; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
; DAGISEL12-NEXT: ; %bb.2: ; %tail
; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9]
; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7]
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
;
; GISEL10-LABEL: basic:
; GISEL10: ; %bb.0: ; %entry
; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
; GISEL10-NEXT: s_mov_b32 s8, s3
; GISEL10-NEXT: s_mov_b32 s9, s4
; GISEL10-NEXT: s_mov_b32 s4, s5
; GISEL10-NEXT: s_mov_b32 s5, s6
; GISEL10-NEXT: s_and_saveexec_b64 s[6:7], s[10:11]
; GISEL10-NEXT: ; %bb.1: ; %shader
; GISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; GISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; GISEL10-NEXT: v_mov_b32_e32 v0, s12
; GISEL10-NEXT: v_mov_b32_e32 v1, s13
; GISEL10-NEXT: s_mov_b64 exec, s[10:11]
; GISEL10-NEXT: v_mov_b32_e32 v11, v0
; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
; GISEL10-NEXT: v_mov_b32_e32 v12, v1
; GISEL10-NEXT: ; %bb.2: ; %tail
; GISEL10-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL10-NEXT: s_mov_b64 exec, s[4:5]
; GISEL10-NEXT: s_setpc_b64 s[8:9]
;
; DAGISEL10-LABEL: basic:
; DAGISEL10: ; %bb.0: ; %entry
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL10-NEXT: s_mov_b32 s7, s6
; DAGISEL10-NEXT: s_mov_b32 s6, s5
; DAGISEL10-NEXT: s_mov_b32 s5, s4
; DAGISEL10-NEXT: s_mov_b32 s4, s3
; DAGISEL10-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; DAGISEL10-NEXT: ; %bb.1: ; %shader
; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL10-NEXT: s_mov_b64 exec, s[10:11]
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13
; DAGISEL10-NEXT: ; %bb.2: ; %tail
; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9]
; DAGISEL10-NEXT: s_mov_b64 exec, s[6:7]
; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
entry:
; Enter whole wave mode. %entry_exec tells whether this lane was active when
; the function was entered.
%entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
; Lanes that were active on entry run %shader; the others skip to %tail.
br i1 %entry_exec, label %shader, label %tail
shader:
; Ordinary computation on %x, stored into field 2 of the VGPR struct.
%nonwwm = add i32 %x, 42
%vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr, i32 %nonwwm, 2
; Computation that goes through set.inactive, ballot and strict.wwm; its
; result is stored into field 3 of the VGPR struct.
%full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
%non.zero = icmp ne i32 %full.vgpr, 0
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %non.zero)
%wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %ballot)
%vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr.1, i64 %wwm, 3
br label %tail
tail:
; Pass on either the untouched %vgpr (coming from %entry) or the updated
; struct (coming from %shader) to the chain callee.
%vgpr.args = phi { i32, ptr addrspace(5), i32, i64} [%vgpr, %entry], [%vgpr.2, %shader]
call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i64 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i64 } %vgpr.args, i32 0)
unreachable
}

View File

@ -10,7 +10,6 @@
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void}
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void}
define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void}
@ -134,34 +133,6 @@ body: |
S_ENDPGM 0
...
---
# Function with a tail call and two WWM reserved registers ($vgpr8, $vgpr9)
# that also sets hasInitWholeWave. The GCN check lines expect the body to come
# out identical to the input.
name: dont_preserve_wwm_if_init_whole_wave
tracksRegLiveness: true
frameInfo:
hasTailCall: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
returnsVoid: true
wwmReservedRegs:
- '$vgpr8'
- '$vgpr9'
hasInitWholeWave: true
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
...
---
name: dont_preserve_non_wwm
tracksRegLiveness: true

View File

@ -1,133 +0,0 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
---
# Test the lowering of SI_INIT_WHOLE_WAVE when the shader body (bb.1) contains
# no whole wave mode code. The check lines expect the pseudo to be replaced by
# an S_OR_SAVEEXEC_B32 -1 at the top of bb.0, with bb.1 and bb.2 left as they
# were in the input.
#
name: test_no_wwm
alignment: 1
exposesReturnsTwice: false
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: test_no_wwm
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:ccr_sgpr_64 = COPY $sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:ccr_sgpr_64 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], [[S_OR_SAVEEXEC_B32_]], implicit-def dead $scc
; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 5, [[COPY2]], 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY3]], implicit-def $scc
; CHECK-NEXT: $vgpr8 = COPY [[COPY2]]
; CHECK-NEXT: $sgpr0 = COPY [[COPY]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr0
; CHECK-NEXT: SI_CS_CHAIN_TC_W32 [[COPY1]], 0, 0, [[COPY4]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
bb.0:
successors: %bb.1, %bb.2
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8
%9:sreg_32 = COPY $sgpr0
undef %1.sub0:ccr_sgpr_64 = COPY $sgpr1
%1.sub1:ccr_sgpr_64 = COPY $sgpr2
%37:vgpr_32 = COPY $vgpr8
%14:sreg_32_xm0_xexec = SI_INIT_WHOLE_WAVE implicit-def $exec, implicit $exec
%16:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
%38:sreg_32 = S_AND_B32 %16:sreg_32_xm0_xexec, %14:sreg_32_xm0_xexec, implicit-def dead $scc
$exec_lo = S_MOV_B32_term %38:sreg_32
S_CBRANCH_EXECZ %bb.2, implicit $exec
S_BRANCH %bb.1
bb.1:
%37:vgpr_32 = V_ADD_U32_e64 5, %37:vgpr_32, 0, implicit $exec
bb.2:
$exec_lo = S_OR_B32 $exec_lo, %16:sreg_32_xm0_xexec, implicit-def $scc
$vgpr8 = COPY %37:vgpr_32
$sgpr0 = COPY %9:sreg_32
%2:sreg_32 = COPY $sgpr0
SI_CS_CHAIN_TC_W32 %1:ccr_sgpr_64, 0, 0, %2:sreg_32, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
...
---
# Test the lowering of SI_INIT_WHOLE_WAVE when the shader body (bb.1) does
# contain whole wave mode code (a V_SET_INACTIVE_B32 whose result feeds a
# STRICT_WWM). The check lines expect an S_OR_SAVEEXEC_B32 -1 at the top of
# bb.0 in place of the pseudo, and ENTER_STRICT_WWM / EXIT_STRICT_WWM around
# the whole wave mode instructions in bb.1.
#
name: test_wwm_bb1
alignment: 1
exposesReturnsTwice: false
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: test_wwm_bb1
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:ccr_sgpr_64 = COPY $sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:ccr_sgpr_64 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY4]], [[S_OR_SAVEEXEC_B32_]], implicit-def dead $scc
; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 24, [[COPY3]], 0, implicit $exec
; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY3]], 71, implicit-def dead $scc, implicit $exec, implicit [[ENTER_STRICT_WWM]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 42, [[V_SET_INACTIVE_B32_]], 0, implicit $exec
; CHECK-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
; CHECK-NEXT: early-clobber [[COPY2]]:vgpr_32 = V_MOV_B32_e32 [[V_ADD_U32_e64_]], implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY4]], implicit-def $scc
; CHECK-NEXT: $vgpr8 = COPY [[COPY2]]
; CHECK-NEXT: $vgpr9 = COPY [[COPY3]]
; CHECK-NEXT: $sgpr0 = COPY [[COPY]]
; CHECK-NEXT: SI_CS_CHAIN_TC_W32 [[COPY1]], 0, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
bb.0:
successors: %bb.1, %bb.2
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9
%9:sreg_32 = COPY $sgpr0
undef %1.sub0:ccr_sgpr_64 = COPY $sgpr1
%1.sub1:ccr_sgpr_64 = COPY $sgpr2
%40:vgpr_32 = COPY $vgpr9
%36:vgpr_32 = COPY $vgpr8
%14:sreg_32_xm0_xexec = SI_INIT_WHOLE_WAVE implicit-def $exec, implicit $exec
%16:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
%38:sreg_32 = S_AND_B32 %16:sreg_32_xm0_xexec, %14:sreg_32_xm0_xexec, implicit-def dead $scc
$exec_lo = S_MOV_B32_term %38:sreg_32
S_CBRANCH_EXECZ %bb.2, implicit $exec
S_BRANCH %bb.1
bb.1:
%36:vgpr_32 = V_ADD_U32_e64 24, %36:vgpr_32, 0, implicit $exec
%19:vgpr_32 = V_SET_INACTIVE_B32 %36:vgpr_32, 71, implicit-def dead $scc, implicit $exec
%18:vgpr_32 = V_ADD_U32_e64 42, %19:vgpr_32, 0, implicit $exec
%40:vgpr_32 = STRICT_WWM %18:vgpr_32, implicit $exec
bb.2:
$exec_lo = S_OR_B32 $exec_lo, %16:sreg_32_xm0_xexec, implicit-def $scc
$vgpr8 = COPY %40:vgpr_32
$vgpr9 = COPY %36:vgpr_32
$sgpr0 = COPY %9:sreg_32
SI_CS_CHAIN_TC_W32 %1:ccr_sgpr_64, 0, 0, %9:sreg_32, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
...

View File

@ -42,7 +42,6 @@
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
@ -308,7 +307,6 @@
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:

View File

@ -42,7 +42,6 @@
; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: ''
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0

View File

@ -42,7 +42,6 @@
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
bb0:

View File

@ -42,7 +42,6 @@
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
bb0:

View File

@ -51,7 +51,6 @@
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@ -155,7 +154,6 @@ body: |
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@ -230,7 +228,6 @@ body: |
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@ -306,7 +303,6 @@ body: |
# FULL-NEXT: vgprForAGPRCopy: ''
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:

View File

@ -51,7 +51,6 @@
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@ -97,7 +96,6 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@ -167,7 +165,6 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define void @function() {
ret void
@ -219,7 +216,6 @@ define void @function() {
; CHECK-NEXT: vgprForAGPRCopy: ''
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
define void @function_nsz() #0 {
ret void