Reverts llvm/llvm-project#108173. si-init-whole-wave.mir crashes on some buildbots (although it passed both locally with sanitizers enabled and in pre-merge tests). Investigating.
This commit is contained in:
parent
42494e5175
commit
7792b4ae79
@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
|
||||
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
|
||||
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
|
||||
|
||||
// Sets the function into whole-wave-mode and returns whether the lane was
|
||||
// active when entering the function. A branch depending on this return will
|
||||
// revert the EXEC mask to what it was when entering the function, thus
|
||||
// resulting in a no-op. This pattern is used to optimize branches when function
|
||||
// tails need to be run in whole-wave-mode. It may also have other consequences
|
||||
// (mostly related to WWM CSR handling) that differentiate it from using
|
||||
// a plain `amdgcn.init.exec -1`.
|
||||
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
|
||||
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
|
||||
|
||||
def int_amdgcn_wavefrontsize :
|
||||
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
|
||||
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
|
||||
|
||||
@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
||||
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
|
||||
SelectDSBvhStackIntrinsic(N);
|
||||
return;
|
||||
case Intrinsic::amdgcn_init_whole_wave:
|
||||
CurDAG->getMachineFunction()
|
||||
.getInfo<SIMachineFunctionInfo>()
|
||||
->setInitWholeWave();
|
||||
break;
|
||||
}
|
||||
|
||||
SelectCode(N);
|
||||
|
||||
@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
|
||||
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
|
||||
}
|
||||
|
||||
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
|
||||
MachineFunction *MF = MI.getParent()->getParent();
|
||||
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
MFInfo->setInitWholeWave();
|
||||
return selectImpl(MI, *CoverageInfo);
|
||||
}
|
||||
|
||||
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
|
||||
if (TM.getOptLevel() > CodeGenOptLevel::None) {
|
||||
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
|
||||
@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
|
||||
return selectDSAppendConsume(I, true);
|
||||
case Intrinsic::amdgcn_ds_consume:
|
||||
return selectDSAppendConsume(I, false);
|
||||
case Intrinsic::amdgcn_init_whole_wave:
|
||||
return selectInitWholeWave(I);
|
||||
case Intrinsic::amdgcn_s_barrier:
|
||||
return selectSBarrier(I);
|
||||
case Intrinsic::amdgcn_raw_buffer_load_lds:
|
||||
|
||||
@ -120,7 +120,6 @@ private:
|
||||
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
|
||||
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
|
||||
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
|
||||
bool selectInitWholeWave(MachineInstr &MI) const;
|
||||
bool selectSBarrier(MachineInstr &MI) const;
|
||||
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
|
||||
|
||||
|
||||
@ -67,8 +67,6 @@ protected:
|
||||
// Kernel may need limited waves per EU for better performance.
|
||||
bool WaveLimiter = false;
|
||||
|
||||
bool HasInitWholeWave = false;
|
||||
|
||||
public:
|
||||
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
|
||||
|
||||
@ -111,9 +109,6 @@ public:
|
||||
return WaveLimiter;
|
||||
}
|
||||
|
||||
bool hasInitWholeWave() const { return HasInitWholeWave; }
|
||||
void setInitWholeWave() { HasInitWholeWave = true; }
|
||||
|
||||
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
|
||||
return allocateLDSGlobal(DL, GV, DynLDSAlign);
|
||||
}
|
||||
|
||||
@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_init_whole_wave:
|
||||
case Intrinsic::amdgcn_live_mask: {
|
||||
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
|
||||
break;
|
||||
|
||||
@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
|
||||
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
|
||||
def : SourceOfDivergence<int_amdgcn_update_dpp>;
|
||||
def : SourceOfDivergence<int_amdgcn_writelane>;
|
||||
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
|
||||
|
||||
foreach intr = AMDGPUMFMAIntrinsics908 in
|
||||
def : SourceOfDivergence<intr>;
|
||||
|
||||
@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
|
||||
? DenormalMode::IEEE
|
||||
: DenormalMode::PreserveSign;
|
||||
|
||||
if (YamlMFI.HasInitWholeWave)
|
||||
MFI->setInitWholeWave();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
|
||||
|
||||
// Allocate spill slots for WWM reserved VGPRs.
|
||||
// For chain functions, we only need to do this if we have calls to
|
||||
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
|
||||
// chain functions do not return) and the function did not contain a call to
|
||||
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
|
||||
// when entering the function).
|
||||
bool IsChainWithoutRestores =
|
||||
FuncInfo->isChainFunction() &&
|
||||
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
|
||||
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
|
||||
// llvm.amdgcn.cs.chain.
|
||||
bool IsChainWithoutCalls =
|
||||
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
|
||||
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
|
||||
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
|
||||
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
|
||||
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
|
||||
|
||||
@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
|
||||
let Defs = [EXEC];
|
||||
}
|
||||
|
||||
// Sets EXEC to all lanes and returns the previous EXEC.
|
||||
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
|
||||
(outs SReg_1:$dst), (ins),
|
||||
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
|
||||
let Defs = [EXEC];
|
||||
let Uses = [EXEC];
|
||||
|
||||
let isConvergent = 1;
|
||||
}
|
||||
|
||||
// Return for returning shaders to a shader variant epilog.
|
||||
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
|
||||
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
|
||||
|
||||
@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
|
||||
StringValue SGPRForEXECCopy;
|
||||
StringValue LongBranchReservedReg;
|
||||
|
||||
bool HasInitWholeWave = false;
|
||||
|
||||
SIMachineFunctionInfo() = default;
|
||||
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
|
||||
const TargetRegisterInfo &TRI,
|
||||
@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
|
||||
StringValue()); // Don't print out when it's empty.
|
||||
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
|
||||
StringValue());
|
||||
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
KillInstrs.push_back(&MI);
|
||||
BBI.NeedsLowering = true;
|
||||
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
|
||||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
|
||||
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
|
||||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
|
||||
InitExecInstrs.push_back(&MI);
|
||||
} else if (WQMOutputs) {
|
||||
// The function is in machine SSA form, which means that physical
|
||||
@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
|
||||
MachineBasicBlock *MBB = MI.getParent();
|
||||
bool IsWave32 = ST->isWave32();
|
||||
|
||||
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
|
||||
assert(MBB == &MBB->getParent()->front() &&
|
||||
"init whole wave not in entry block");
|
||||
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
|
||||
MachineInstr *SaveExec =
|
||||
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
|
||||
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
|
||||
: AMDGPU::S_OR_SAVEEXEC_B64),
|
||||
EntryExec)
|
||||
.addImm(-1);
|
||||
|
||||
// Replace all uses of MI's destination reg with EntryExec.
|
||||
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
|
||||
|
||||
if (LIS) {
|
||||
LIS->RemoveMachineInstrFromMaps(MI);
|
||||
}
|
||||
|
||||
MI.eraseFromParent();
|
||||
|
||||
if (LIS) {
|
||||
LIS->InsertMachineInstrInMaps(*SaveExec);
|
||||
LIS->createAndComputeVirtRegInterval(EntryExec);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
|
||||
// This should be before all vector instructions.
|
||||
MachineInstr *InitMI =
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,140 +0,0 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
|
||||
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
|
||||
; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
|
||||
; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
|
||||
|
||||
; This shouldn't be too different from wave32, so we'll only test one case.
|
||||
|
||||
define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 inreg %exec, { i32, ptr addrspace(5), i32, i64 } %vgpr, i32 %x, i32 %y) {
|
||||
; GISEL12-LABEL: basic:
|
||||
; GISEL12: ; %bb.0: ; %entry
|
||||
; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GISEL12-NEXT: s_wait_expcnt 0x0
|
||||
; GISEL12-NEXT: s_wait_samplecnt 0x0
|
||||
; GISEL12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GISEL12-NEXT: s_wait_kmcnt 0x0
|
||||
; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GISEL12-NEXT: s_mov_b32 s8, s3
|
||||
; GISEL12-NEXT: s_mov_b32 s9, s4
|
||||
; GISEL12-NEXT: s_mov_b32 s4, s5
|
||||
; GISEL12-NEXT: s_mov_b32 s5, s6
|
||||
; GISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL12-NEXT: s_and_saveexec_b64 s[6:7], s[10:11]
|
||||
; GISEL12-NEXT: ; %bb.1: ; %shader
|
||||
; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
|
||||
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
|
||||
; GISEL12-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GISEL12-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GISEL12-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GISEL12-NEXT: v_mov_b32_e32 v11, v0
|
||||
; GISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
|
||||
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GISEL12-NEXT: v_mov_b32_e32 v12, v1
|
||||
; GISEL12-NEXT: ; %bb.2: ; %tail
|
||||
; GISEL12-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GISEL12-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL12-NEXT: s_setpc_b64 s[8:9]
|
||||
;
|
||||
; DAGISEL12-LABEL: basic:
|
||||
; DAGISEL12: ; %bb.0: ; %entry
|
||||
; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; DAGISEL12-NEXT: s_wait_expcnt 0x0
|
||||
; DAGISEL12-NEXT: s_wait_samplecnt 0x0
|
||||
; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
|
||||
; DAGISEL12-NEXT: s_wait_kmcnt 0x0
|
||||
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; DAGISEL12-NEXT: s_mov_b32 s7, s6
|
||||
; DAGISEL12-NEXT: s_mov_b32 s6, s5
|
||||
; DAGISEL12-NEXT: s_mov_b32 s5, s4
|
||||
; DAGISEL12-NEXT: s_mov_b32 s4, s3
|
||||
; DAGISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL12-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
|
||||
; DAGISEL12-NEXT: ; %bb.1: ; %shader
|
||||
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; DAGISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
|
||||
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
|
||||
; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12
|
||||
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
|
||||
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
|
||||
; DAGISEL12-NEXT: ; %bb.2: ; %tail
|
||||
; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; DAGISEL12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GISEL10-LABEL: basic:
|
||||
; GISEL10: ; %bb.0: ; %entry
|
||||
; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GISEL10-NEXT: s_mov_b32 s8, s3
|
||||
; GISEL10-NEXT: s_mov_b32 s9, s4
|
||||
; GISEL10-NEXT: s_mov_b32 s4, s5
|
||||
; GISEL10-NEXT: s_mov_b32 s5, s6
|
||||
; GISEL10-NEXT: s_and_saveexec_b64 s[6:7], s[10:11]
|
||||
; GISEL10-NEXT: ; %bb.1: ; %shader
|
||||
; GISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
|
||||
; GISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
|
||||
; GISEL10-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GISEL10-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GISEL10-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GISEL10-NEXT: v_mov_b32_e32 v11, v0
|
||||
; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
|
||||
; GISEL10-NEXT: v_mov_b32_e32 v12, v1
|
||||
; GISEL10-NEXT: ; %bb.2: ; %tail
|
||||
; GISEL10-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GISEL10-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL10-NEXT: s_setpc_b64 s[8:9]
|
||||
;
|
||||
; DAGISEL10-LABEL: basic:
|
||||
; DAGISEL10: ; %bb.0: ; %entry
|
||||
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; DAGISEL10-NEXT: s_mov_b32 s7, s6
|
||||
; DAGISEL10-NEXT: s_mov_b32 s6, s5
|
||||
; DAGISEL10-NEXT: s_mov_b32 s5, s4
|
||||
; DAGISEL10-NEXT: s_mov_b32 s4, s3
|
||||
; DAGISEL10-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
|
||||
; DAGISEL10-NEXT: ; %bb.1: ; %shader
|
||||
; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
|
||||
; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
|
||||
; DAGISEL10-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
|
||||
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
|
||||
; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13
|
||||
; DAGISEL10-NEXT: ; %bb.2: ; %tail
|
||||
; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; DAGISEL10-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
|
||||
entry:
|
||||
%entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
|
||||
br i1 %entry_exec, label %shader, label %tail
|
||||
|
||||
shader:
|
||||
%nonwwm = add i32 %x, 42
|
||||
%vgpr.1 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr, i32 %nonwwm, 2
|
||||
|
||||
%full.vgpr = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 71)
|
||||
%non.zero = icmp ne i32 %full.vgpr, 0
|
||||
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %non.zero)
|
||||
%wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %ballot)
|
||||
%vgpr.2 = insertvalue { i32, ptr addrspace(5), i32, i64} %vgpr.1, i64 %wwm, 3
|
||||
|
||||
br label %tail
|
||||
|
||||
tail:
|
||||
%vgpr.args = phi { i32, ptr addrspace(5), i32, i64} [%vgpr, %entry], [%vgpr.2, %shader]
|
||||
call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i64 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i64 } %vgpr.args, i32 0)
|
||||
unreachable
|
||||
}
|
||||
@ -10,7 +10,6 @@
|
||||
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
|
||||
define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void}
|
||||
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
|
||||
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
|
||||
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
|
||||
define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void}
|
||||
define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void}
|
||||
@ -134,34 +133,6 @@ body: |
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: dont_preserve_wwm_if_init_whole_wave
|
||||
tracksRegLiveness: true
|
||||
frameInfo:
|
||||
hasTailCall: true
|
||||
machineFunctionInfo:
|
||||
stackPtrOffsetReg: '$sgpr32'
|
||||
returnsVoid: true
|
||||
wwmReservedRegs:
|
||||
- '$vgpr8'
|
||||
- '$vgpr9'
|
||||
hasInitWholeWave: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
||||
|
||||
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
|
||||
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
||||
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
||||
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
||||
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
||||
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
||||
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: dont_preserve_non_wwm
|
||||
tracksRegLiveness: true
|
||||
|
||||
@ -1,133 +0,0 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
|
||||
|
||||
---
|
||||
# Test that we don't do silly things when there is no whole wave mode in the
|
||||
# shader (aka bb.1).
|
||||
#
|
||||
name: test_no_wwm
|
||||
alignment: 1
|
||||
exposesReturnsTwice: false
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
; CHECK-LABEL: name: test_no_wwm
|
||||
; CHECK: bb.0:
|
||||
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:ccr_sgpr_64 = COPY $sgpr1
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:ccr_sgpr_64 = COPY $sgpr2
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr8
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], [[S_OR_SAVEEXEC_B32_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]]
|
||||
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; CHECK-NEXT: S_BRANCH %bb.1
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
; CHECK-NEXT: successors: %bb.2(0x80000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 5, [[COPY2]], 0, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY3]], implicit-def $scc
|
||||
; CHECK-NEXT: $vgpr8 = COPY [[COPY2]]
|
||||
; CHECK-NEXT: $sgpr0 = COPY [[COPY]]
|
||||
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: SI_CS_CHAIN_TC_W32 [[COPY1]], 0, 0, [[COPY4]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8
|
||||
%9:sreg_32 = COPY $sgpr0
|
||||
undef %1.sub0:ccr_sgpr_64 = COPY $sgpr1
|
||||
%1.sub1:ccr_sgpr_64 = COPY $sgpr2
|
||||
%37:vgpr_32 = COPY $vgpr8
|
||||
%14:sreg_32_xm0_xexec = SI_INIT_WHOLE_WAVE implicit-def $exec, implicit $exec
|
||||
%16:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
|
||||
%38:sreg_32 = S_AND_B32 %16:sreg_32_xm0_xexec, %14:sreg_32_xm0_xexec, implicit-def dead $scc
|
||||
$exec_lo = S_MOV_B32_term %38:sreg_32
|
||||
S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
%37:vgpr_32 = V_ADD_U32_e64 5, %37:vgpr_32, 0, implicit $exec
|
||||
|
||||
bb.2:
|
||||
$exec_lo = S_OR_B32 $exec_lo, %16:sreg_32_xm0_xexec, implicit-def $scc
|
||||
$vgpr8 = COPY %37:vgpr_32
|
||||
$sgpr0 = COPY %9:sreg_32
|
||||
%2:sreg_32 = COPY $sgpr0
|
||||
SI_CS_CHAIN_TC_W32 %1:ccr_sgpr_64, 0, 0, %2:sreg_32, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
||||
...
|
||||
|
||||
---
|
||||
# Test that we handle WWM in the shader correctly.
|
||||
#
|
||||
name: test_wwm_bb1
|
||||
alignment: 1
|
||||
exposesReturnsTwice: false
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
; CHECK-LABEL: name: test_wwm_bb1
|
||||
; CHECK: bb.0:
|
||||
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:ccr_sgpr_64 = COPY $sgpr1
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:ccr_sgpr_64 = COPY $sgpr2
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
|
||||
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY4]], [[S_OR_SAVEEXEC_B32_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]]
|
||||
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; CHECK-NEXT: S_BRANCH %bb.1
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.1:
|
||||
; CHECK-NEXT: successors: %bb.2(0x80000000)
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 24, [[COPY3]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY3]], 71, implicit-def dead $scc, implicit $exec, implicit [[ENTER_STRICT_WWM]]
|
||||
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 42, [[V_SET_INACTIVE_B32_]], 0, implicit $exec
|
||||
; CHECK-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
|
||||
; CHECK-NEXT: early-clobber [[COPY2]]:vgpr_32 = V_MOV_B32_e32 [[V_ADD_U32_e64_]], implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY4]], implicit-def $scc
|
||||
; CHECK-NEXT: $vgpr8 = COPY [[COPY2]]
|
||||
; CHECK-NEXT: $vgpr9 = COPY [[COPY3]]
|
||||
; CHECK-NEXT: $sgpr0 = COPY [[COPY]]
|
||||
; CHECK-NEXT: SI_CS_CHAIN_TC_W32 [[COPY1]], 0, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9
|
||||
%9:sreg_32 = COPY $sgpr0
|
||||
undef %1.sub0:ccr_sgpr_64 = COPY $sgpr1
|
||||
%1.sub1:ccr_sgpr_64 = COPY $sgpr2
|
||||
%40:vgpr_32 = COPY $vgpr9
|
||||
%36:vgpr_32 = COPY $vgpr8
|
||||
%14:sreg_32_xm0_xexec = SI_INIT_WHOLE_WAVE implicit-def $exec, implicit $exec
|
||||
%16:sreg_32_xm0_xexec = COPY $exec_lo, implicit-def $exec_lo
|
||||
%38:sreg_32 = S_AND_B32 %16:sreg_32_xm0_xexec, %14:sreg_32_xm0_xexec, implicit-def dead $scc
|
||||
$exec_lo = S_MOV_B32_term %38:sreg_32
|
||||
S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
%36:vgpr_32 = V_ADD_U32_e64 24, %36:vgpr_32, 0, implicit $exec
|
||||
%19:vgpr_32 = V_SET_INACTIVE_B32 %36:vgpr_32, 71, implicit-def dead $scc, implicit $exec
|
||||
%18:vgpr_32 = V_ADD_U32_e64 42, %19:vgpr_32, 0, implicit $exec
|
||||
%40:vgpr_32 = STRICT_WWM %18:vgpr_32, implicit $exec
|
||||
|
||||
bb.2:
|
||||
$exec_lo = S_OR_B32 $exec_lo, %16:sreg_32_xm0_xexec, implicit-def $scc
|
||||
$vgpr8 = COPY %40:vgpr_32
|
||||
$vgpr9 = COPY %36:vgpr_32
|
||||
$sgpr0 = COPY %9:sreg_32
|
||||
SI_CS_CHAIN_TC_W32 %1:ccr_sgpr_64, 0, 0, %9:sreg_32, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
||||
...
|
||||
@ -42,7 +42,6 @@
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
|
||||
entry:
|
||||
@ -308,7 +307,6 @@
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
|
||||
entry:
|
||||
|
||||
@ -42,7 +42,6 @@
|
||||
; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
|
||||
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
|
||||
; AFTER-PEI-NEXT: longBranchReservedReg: ''
|
||||
; AFTER-PEI-NEXT: hasInitWholeWave: false
|
||||
; AFTER-PEI-NEXT: body:
|
||||
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
|
||||
|
||||
@ -42,7 +42,6 @@
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
|
||||
bb0:
|
||||
|
||||
@ -42,7 +42,6 @@
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
|
||||
bb0:
|
||||
|
||||
@ -51,7 +51,6 @@
|
||||
# FULL-NEXT: vgprForAGPRCopy: ''
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: body:
|
||||
|
||||
# SIMPLE: machineFunctionInfo:
|
||||
@ -155,7 +154,6 @@ body: |
|
||||
# FULL-NEXT: vgprForAGPRCopy: ''
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: body:
|
||||
|
||||
# SIMPLE: machineFunctionInfo:
|
||||
@ -230,7 +228,6 @@ body: |
|
||||
# FULL-NEXT: vgprForAGPRCopy: ''
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: body:
|
||||
|
||||
# SIMPLE: machineFunctionInfo:
|
||||
@ -306,7 +303,6 @@ body: |
|
||||
# FULL-NEXT: vgprForAGPRCopy: ''
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: body:
|
||||
|
||||
# SIMPLE: machineFunctionInfo:
|
||||
|
||||
@ -51,7 +51,6 @@
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
||||
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
|
||||
@ -97,7 +96,6 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
|
||||
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
|
||||
@ -167,7 +165,6 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define void @function() {
|
||||
ret void
|
||||
@ -219,7 +216,6 @@ define void @function() {
|
||||
; CHECK-NEXT: vgprForAGPRCopy: ''
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: body:
|
||||
define void @function_nsz() #0 {
|
||||
ret void
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user