
Instead of lengthy constructors, we can now set the members of a read-only struct before the Attributor is created. This should make it clearer what is configurable and help introduce new options in the future. This change also adds an explicit IsModulePass option instead of deducing it from the size of the Function set. No functional change is intended.
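For illustration, the configuration pattern described above reduces to roughly the following sketch, mirroring the setup in runOnModule further down in this file (the members shown are the AttributorConfig fields this pass actually sets):

    CallGraphUpdater CGUpdater;
    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;                      // restrict which AAs may be created
    AC.IsModulePass = true;                     // stated explicitly rather than deduced from the Function set size
    AC.DefaultInitializeLiveInternals = false;
    Attributor A(Functions, InfoCache, AC);     // the Attributor takes the config read-only from here on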
//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/IntrinsicsR600.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/Transforms/IPO/Attributor.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-attributor"
|
|
|
|
using namespace llvm;
|
|
|
|
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions manually, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A,
                                      AAPointerInfo::OffsetAndSize OAS) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)