
Performance testing shows no significant gains or losses on graphics workloads, so this is mostly to make the behavior consistent across all supported OSes instead of special-casing HSA.
243 lines
7.5 KiB
C++
243 lines
7.5 KiB
C++
//===-- SIProgramInfo.cpp ----------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
///
|
|
/// The SIProgramInfo tracks resource usage and hardware flags for kernels and
|
|
/// entry functions.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
|
|
#include "SIProgramInfo.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "SIDefines.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/MC/MCExpr.h"
|
|
|
|
using namespace llvm;
|
|
|
|
void SIProgramInfo::reset(const MachineFunction &MF) {
|
|
MCContext &Ctx = MF.getContext();
|
|
|
|
const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
|
|
|
|
CodeSizeInBytes.reset();
|
|
|
|
VGPRBlocks = ZeroExpr;
|
|
SGPRBlocks = ZeroExpr;
|
|
Priority = 0;
|
|
FloatMode = 0;
|
|
Priv = 0;
|
|
DX10Clamp = 0;
|
|
DebugMode = 0;
|
|
IEEEMode = 0;
|
|
WgpMode = 0;
|
|
MemOrdered = 0;
|
|
FwdProgress = 0;
|
|
RrWgMode = 0;
|
|
ScratchSize = ZeroExpr;
|
|
|
|
LDSBlocks = 0;
|
|
ScratchBlocks = ZeroExpr;
|
|
|
|
ScratchEnable = ZeroExpr;
|
|
UserSGPR = 0;
|
|
TrapHandlerEnable = 0;
|
|
TGIdXEnable = 0;
|
|
TGIdYEnable = 0;
|
|
TGIdZEnable = 0;
|
|
TGSizeEnable = 0;
|
|
TIdIGCompCount = 0;
|
|
EXCPEnMSB = 0;
|
|
LdsSize = 0;
|
|
EXCPEnable = 0;
|
|
|
|
ComputePGMRSrc3 = ZeroExpr;
|
|
|
|
NumVGPR = ZeroExpr;
|
|
NumArchVGPR = ZeroExpr;
|
|
NumAccVGPR = ZeroExpr;
|
|
AccumOffset = ZeroExpr;
|
|
TgSplit = 0;
|
|
NumSGPR = ZeroExpr;
|
|
SGPRSpill = 0;
|
|
VGPRSpill = 0;
|
|
LDSSize = 0;
|
|
FlatUsed = ZeroExpr;
|
|
|
|
NumSGPRsForWavesPerEU = ZeroExpr;
|
|
NumVGPRsForWavesPerEU = ZeroExpr;
|
|
Occupancy = ZeroExpr;
|
|
DynamicCallStack = ZeroExpr;
|
|
VCCUsed = ZeroExpr;
|
|
}
|
|
|
|
static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
|
|
const GCNSubtarget &ST) {
|
|
uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
|
|
S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
|
|
S_00B848_PRIV(ProgInfo.Priv) |
|
|
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
|
|
S_00B848_WGP_MODE(ProgInfo.WgpMode) |
|
|
S_00B848_MEM_ORDERED(ProgInfo.MemOrdered) |
|
|
S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
|
|
|
|
if (ST.hasDX10ClampMode())
|
|
Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
|
|
|
|
if (ST.hasIEEEMode())
|
|
Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
|
|
|
|
if (ST.hasRrWGMode())
|
|
Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
|
|
|
|
return Reg;
|
|
}
|
|
|
|
static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
|
|
CallingConv::ID CC, const GCNSubtarget &ST) {
|
|
uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
|
|
S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
|
|
S_00B848_PRIV(ProgInfo.Priv) |
|
|
S_00B848_DEBUG_MODE(ProgInfo.DebugMode);
|
|
|
|
if (ST.hasDX10ClampMode())
|
|
Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
|
|
|
|
if (ST.hasIEEEMode())
|
|
Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
|
|
|
|
if (ST.hasRrWGMode())
|
|
Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
|
|
|
|
switch (CC) {
|
|
case CallingConv::AMDGPU_PS:
|
|
Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
|
|
break;
|
|
case CallingConv::AMDGPU_VS:
|
|
Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
|
|
break;
|
|
case CallingConv::AMDGPU_GS:
|
|
Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
|
|
S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
|
|
break;
|
|
case CallingConv::AMDGPU_HS:
|
|
Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
|
|
S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return Reg;
|
|
}
|
|
|
|
static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
|
|
uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
|
|
S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
|
|
S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
|
|
S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
|
|
S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
|
|
S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
|
|
S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
|
|
S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
|
|
S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
|
|
S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
|
|
|
|
return Reg;
|
|
}
|
|
|
|
static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
|
|
MCContext &Ctx) {
|
|
if (Mask) {
|
|
const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
|
|
Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
|
|
}
|
|
if (Shift) {
|
|
const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
|
|
Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
|
|
}
|
|
return Val;
|
|
}
|
|
|
|
const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
|
|
MCContext &Ctx) const {
|
|
uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
|
|
const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
|
|
const MCExpr *Res = MCBinaryExpr::createOr(
|
|
MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
|
|
MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
|
|
return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
|
|
}
|
|
|
|
const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
|
|
const GCNSubtarget &ST,
|
|
MCContext &Ctx) const {
|
|
if (AMDGPU::isCompute(CC)) {
|
|
return getComputePGMRSrc1(ST, Ctx);
|
|
}
|
|
|
|
uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);
|
|
const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
|
|
const MCExpr *Res = MCBinaryExpr::createOr(
|
|
MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
|
|
MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
|
|
return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
|
|
}
|
|
|
|
const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
|
|
uint64_t Reg = getComputePGMRSrc2Reg(*this);
|
|
const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
|
|
return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
|
|
}
|
|
|
|
const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
|
|
MCContext &Ctx) const {
|
|
if (AMDGPU::isCompute(CC))
|
|
return getComputePGMRSrc2(Ctx);
|
|
|
|
return MCConstantExpr::create(0, Ctx);
|
|
}
|
|
|
|
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
|
|
bool IsLowerBound) {
|
|
if (!IsLowerBound && CodeSizeInBytes.has_value())
|
|
return *CodeSizeInBytes;
|
|
|
|
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = STM.getInstrInfo();
|
|
|
|
uint64_t CodeSize = 0;
|
|
|
|
for (const MachineBasicBlock &MBB : MF) {
|
|
// The amount of padding to align code can be both underestimated and
|
|
// overestimated. In case of inline asm used getInstSizeInBytes() will
|
|
// return a maximum size of a single instruction, where the real size may
|
|
// differ. At this point CodeSize may be already off.
|
|
if (!IsLowerBound)
|
|
CodeSize = alignTo(CodeSize, MBB.getAlignment());
|
|
|
|
for (const MachineInstr &MI : MBB) {
|
|
// TODO: CodeSize should account for multiple functions.
|
|
|
|
if (MI.isMetaInstruction())
|
|
continue;
|
|
|
|
// We cannot properly estimate inline asm size. It can be as small as zero
|
|
// if that is just a comment.
|
|
if (IsLowerBound && MI.isInlineAsm())
|
|
continue;
|
|
|
|
CodeSize += TII->getInstSizeInBytes(MI);
|
|
}
|
|
}
|
|
|
|
CodeSizeInBytes = CodeSize;
|
|
return CodeSize;
|
|
}
|