
This reverts commit 0954c9d487e7cb30673df9f0ac125f71320d2936, which breaks the build when compiled with gcc version 11.4.0 (Ubuntu 11.4.0-1ubuntu1~22.04).
//===-- AMDGPULowerBufferFatPointers.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
// The buffer resource part needs to be a "raw" buffer resource (it must have
// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in order
// to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we typically want to separate the resource
// and offset parts into separate variables, but combine them together when
// encountering cases where this is required, such as when inserting these
// values into aggregates or moving them to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
//
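// For example (an illustrative sketch with made-up value names, not the exact
// IR this pass emits), an offset computation such as
// ```
// %x = getelementptr i8, ptr addrspace(7) %q, i32 4
// ```
// is conceptually rewritten in terms of the two parts as
// ```
// %x.rsrc = %q.rsrc           ; the ptr addrspace(8) part is unchanged
// %x.off = add i32 %q.off, 4  ; only the 32-bit offset is adjusted
// ```
//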
// # Implementation
//
// This pass proceeds in the following main phases:
//
// ## Rewriting loads and stores of p7 and memcpy()-like handling
//
// The first phase is to rewrite away all loads and stores of `ptr
// addrspace(7)`, including aggregates containing such pointers, to ones that
// use `i160`. This is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`,
// which visits loads, stores, and allocas and, if the loaded or stored type
// contains `ptr addrspace(7)`, rewrites that type to one where the p7s are
// replaced by i160s, copying other parts of aggregates as needed. In the case
// of a store, each pointer is `ptrtoint`d to i160 before storing, and loaded
// integers are `inttoptr`d back. This same transformation is applied to
// vectors of pointers.
//
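// A minimal sketch of the store rewrite (value names are illustrative):
// ```
// store ptr addrspace(7) %p, ptr %slot
// ```
// becomes
// ```
// %p.int = ptrtoint ptr addrspace(7) %p to i160
// store i160 %p.int, ptr %slot
// ```
//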
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and `Nxi160` directly. Instead, that transposing action (where the vectors
// of resources and vectors of offsets are concatenated before being stored to
// memory) is handled through implementing `inttoptr` and `ptrtoint` only.
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
// In order to save on O(N) work and to ensure that the contents type
// legalizer correctly splits up wide loads, memcpy-like intrinsics are also
// unconditionally lowered into loops here.
//
// ## Buffer contents type legalization
//
// The underlying buffer intrinsics only support types up to 128 bits long,
// and don't support complex types. If buffer operations were
// standard pointer operations that could be represented as MIR-level loads,
// this would be handled by the various legalization schemes in instruction
// selection. However, because we have to do the conversion from `load` and
// `store` to intrinsics at LLVM IR level, we must perform that legalization
// ourselves.
//
// This involves a combination of
// - Converting arrays to vectors where possible
// - Otherwise, splitting loads and stores of aggregates into loads/stores of
//   each component.
// - Zero-extending things to fill a whole number of bytes
// - Casting values of types that don't neatly correspond to supported machine
//   values (for example, an i96 or i256) into ones that do (like <3 x i32>
//   and <8 x i32>, respectively); see the sketch after this list
// - Splitting values that are too long (such as the aforementioned <8 x i32>)
//   into multiple operations.
//
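// As a sketch of the overall effect (illustrative only; the exact casts
// depend on the type involved):
// ```
// %v = load i96, ptr addrspace(7) %p
// ```
// is legalized into something like
// ```
// %v.vec = load <3 x i32>, ptr addrspace(7) %p
// %v = bitcast <3 x i32> %v.vec to i96
// ```
//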
// ## Type remapping
//
// We use a `ValueMapper` to map uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
// to perform the rewrite, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
// but all such invalid states will be resolved by the pointer-splitting phase
// below.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// with {ptr addrspace(8), i32} arguments and return values.
// This phase also records intrinsics so that they can be remangled or deleted
// later.
//
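// For instance (a sketch; the function name is made up), a signature such as
// ```
// define float @f(ptr addrspace(7) %p)
// ```
// is remapped to
// ```
// define float @f({ptr addrspace(8), i32} %p)
// ```
//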
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which is
// needed for correctness and performance. It records a list of "split users",
// instructions that are being replaced by operations on the resource and offset
// parts.
//
// Split users do not necessarily need to produce parts themselves
// (a `load float, ptr addrspace(7)` does not, for example), but, if they do not
// generate fat buffer pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements, creating
// a parallel set of instructions that does not refer to the original fat
// pointer values but instead to their resource and offset components.
//
// Instructions, such as `extractvalue`, that produce buffer fat pointers from
// sources that do not have split parts, have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
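// Concretely (a sketch), when a value `%v` of the remapped struct type has no
// recorded parts, they are materialized as
// ```
// %v.rsrc = extractvalue {ptr addrspace(8), i32} %v, 0
// %v.off = extractvalue {ptr addrspace(8), i32} %v, 1
// ```
//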
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant resources
// would cause additional registers to be sent around loops and might lead to
// waterfall loops being generated for buffer operations due to the
// "non-uniform" resource argument.
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's input could come from as well as collecting all conditional
// instructions encountered during the search. If, after filtering out the
// initial node itself, the set of encountered conditionals is a subset of the
// potential roots and there is a single potential resource that isn't in the
// conditional set, that value is the only possible value the resource argument
// could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part changed
// to the singleton value and then be replaced by a PHI on the offsets.
// Otherwise, each PHI node is split into two, one for the resource part and one
// for the offset part, which replace the temporary `extractvalue` instructions
// that were added during the first pass.
//
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
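// As a sketch of the PHI case (with illustrative value names), a loop-carried
// pointer
// ```
// %p = phi ptr addrspace(7) [ %init, %entry ], [ %next, %loop ]
// ```
// whose resource part turns out to be loop-invariant keeps `%init.rsrc` as its
// resource part, and only the offset is phi'd:
// ```
// %p.off = phi i32 [ %init.off, %entry ], [ %next.off, %loop ]
// ```
//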
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore now
// produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`; see the sketch after this list.
//
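// For example (a sketch), if some un-split user still needs the struct-typed
// result `%p`, it is reconstructed from the parts before the old instruction
// is erased:
// ```
// %p.tmp = insertvalue {ptr addrspace(8), i32} poison, ptr addrspace(8) %p.rsrc, 0
// %p.new = insertvalue {ptr addrspace(8), i32} %p.tmp, i32 %p.off, 1
// ```
//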
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
// %y = getelementptr i32, ptr addrspace(7) %x, i32 1
// %a = load i32, ptr addrspace(7) %x
// %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias.
// ```
// %y.off = add i32 %x.off, 1
// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %x.off, ...)
// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8)
//     %x.rsrc, i32 %y.off, ...)
// ```
// However, existing alias information is preserved.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
|
|
#include "AMDGPUTargetMachine.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "SIDefines.h"
|
|
#include "llvm/ADT/SetOperations.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/Analysis/InstSimplifyFolder.h"
|
|
#include "llvm/Analysis/Utils/Local.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/AttributeMask.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/DebugInfo.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstIterator.h"
|
|
#include "llvm/IR/InstVisitor.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/Metadata.h"
|
|
#include "llvm/IR/Operator.h"
|
|
#include "llvm/IR/PatternMatch.h"
|
|
#include "llvm/IR/ReplaceConstant.h"
|
|
#include "llvm/IR/ValueHandle.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/AMDGPUAddrSpace.h"
|
|
#include "llvm/Support/Alignment.h"
|
|
#include "llvm/Support/AtomicOrdering.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Transforms/Utils/Cloning.h"
|
|
#include "llvm/Transforms/Utils/Local.h"
|
|
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
|
|
#include "llvm/Transforms/Utils/ValueMapper.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
|
|
|
|
using namespace llvm;
|
|
|
|
static constexpr unsigned BufferOffsetWidth = 32;
|
|
|
|
namespace {
|
|
/// Recursively replace instances of ptr addrspace(7) and vector<Nxptr
|
|
/// addrspace(7)> with some other type as defined by the relevant subclass.
|
|
class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper {
|
|
DenseMap<Type *, Type *> Map;
|
|
|
|
Type *remapTypeImpl(Type *Ty);
|
|
|
|
protected:
|
|
virtual Type *remapScalar(PointerType *PT) = 0;
|
|
virtual Type *remapVector(VectorType *VT) = 0;
|
|
|
|
const DataLayout &DL;
|
|
|
|
public:
|
|
BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {}
|
|
Type *remapType(Type *SrcTy) override;
|
|
void clear() { Map.clear(); }
|
|
};
|
|
|
|
/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
|
|
class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase {
|
|
using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;
|
|
|
|
protected:
|
|
Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); }
|
|
Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); }
|
|
};
|
|
|
|
/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
|
|
/// parts of the pointer) so that we can easily rewrite operations on these
|
|
/// values that aren't loading them from or storing them to memory.
|
|
class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase {
|
|
using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;
|
|
|
|
protected:
|
|
Type *remapScalar(PointerType *PT) override;
|
|
Type *remapVector(VectorType *VT) override;
|
|
};
|
|
} // namespace
|
|
|
|
// This code is adapted from the type remapper in lib/Linker/IRMover.cpp
|
|
Type *BufferFatPtrTypeLoweringBase::remapTypeImpl(Type *Ty) {
|
|
Type **Entry = &Map[Ty];
|
|
if (*Entry)
|
|
return *Entry;
|
|
if (auto *PT = dyn_cast<PointerType>(Ty)) {
|
|
if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
|
|
return *Entry = remapScalar(PT);
|
|
}
|
|
}
|
|
if (auto *VT = dyn_cast<VectorType>(Ty)) {
|
|
auto *PT = dyn_cast<PointerType>(VT->getElementType());
|
|
if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
|
|
return *Entry = remapVector(VT);
|
|
}
|
|
return *Entry = Ty;
|
|
}
|
|
// Whether the type is one that is structurally uniqued - that is, whether it
// is not a named struct (the only kind of type where multiple structurally
// identical types can have distinct `Type*`s).
|
|
StructType *TyAsStruct = dyn_cast<StructType>(Ty);
|
|
bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral();
|
|
// Base case for ints, floats, opaque pointers, and so on, which don't
|
|
// require recursion.
|
|
if (Ty->getNumContainedTypes() == 0 && IsUniqued)
|
|
return *Entry = Ty;
|
|
bool Changed = false;
|
|
SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr);
|
|
for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) {
|
|
Type *OldElem = Ty->getContainedType(I);
|
|
Type *NewElem = remapTypeImpl(OldElem);
|
|
ElementTypes[I] = NewElem;
|
|
Changed |= (OldElem != NewElem);
|
|
}
|
|
// Recursive calls to remapTypeImpl() may have invalidated the pointer.
|
|
Entry = &Map[Ty];
|
|
if (!Changed) {
|
|
return *Entry = Ty;
|
|
}
|
|
if (auto *ArrTy = dyn_cast<ArrayType>(Ty))
|
|
return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements());
|
|
if (auto *FnTy = dyn_cast<FunctionType>(Ty))
|
|
return *Entry = FunctionType::get(ElementTypes[0],
|
|
ArrayRef(ElementTypes).slice(1),
|
|
FnTy->isVarArg());
|
|
if (auto *STy = dyn_cast<StructType>(Ty)) {
|
|
// Genuine opaque types don't have a remapping.
|
|
if (STy->isOpaque())
|
|
return *Entry = Ty;
|
|
bool IsPacked = STy->isPacked();
|
|
if (IsUniqued)
|
|
return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
|
|
SmallString<16> Name(STy->getName());
|
|
STy->setName("");
|
|
return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name,
|
|
IsPacked);
|
|
}
|
|
llvm_unreachable("Unknown type of type that contains elements");
|
|
}
|
|
|
|
Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) {
|
|
return remapTypeImpl(SrcTy);
|
|
}
|
|
|
|
Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) {
|
|
LLVMContext &Ctx = PT->getContext();
|
|
return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE),
|
|
IntegerType::get(Ctx, BufferOffsetWidth));
|
|
}
|
|
|
|
Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) {
|
|
ElementCount EC = VT->getElementCount();
|
|
LLVMContext &Ctx = VT->getContext();
|
|
Type *RsrcVec =
|
|
VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC);
|
|
Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC);
|
|
return StructType::get(RsrcVec, OffVec);
|
|
}
|
|
|
|
static bool isBufferFatPtrOrVector(Type *Ty) {
|
|
if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType()))
|
|
return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER;
|
|
return false;
|
|
}
|
|
|
|
// True if the type is {ptr addrspace(8), i32} or a struct containing vectors of
|
|
// those types. Used to quickly skip instructions we don't need to process.
|
|
static bool isSplitFatPtr(Type *Ty) {
|
|
auto *ST = dyn_cast<StructType>(Ty);
|
|
if (!ST)
|
|
return false;
|
|
if (!ST->isLiteral() || ST->getNumElements() != 2)
|
|
return false;
|
|
auto *MaybeRsrc =
|
|
dyn_cast<PointerType>(ST->getElementType(0)->getScalarType());
|
|
auto *MaybeOff =
|
|
dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType());
|
|
return MaybeRsrc && MaybeOff &&
|
|
MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE &&
|
|
MaybeOff->getBitWidth() == BufferOffsetWidth;
|
|
}
|
|
|
|
// True if the constant's type or any of its operands' types are buffer fat pointers.
|
|
static bool isBufferFatPtrConst(Constant *C) {
|
|
Type *T = C->getType();
|
|
return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) {
|
|
return isBufferFatPtrOrVector(U.get()->getType());
|
|
});
|
|
}
|
|
|
|
namespace {
|
|
/// Convert [vectors of] buffer fat pointers to integers when they are read from
|
|
/// or stored to memory. This ensures that these pointers will have the same
|
|
/// memory layout as before they are lowered, even though they will no longer
|
|
/// have their previous layout in registers/in the program (they'll be broken
|
|
/// down into resource and offset parts). This has the downside of imposing
|
|
/// marshalling costs when reading or storing these values, but since placing
|
|
/// such pointers into memory is an uncommon operation at best, we feel that
|
|
/// this cost is acceptable for better performance in the common case.
|
|
class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
|
|
: public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
|
|
BufferFatPtrToIntTypeMap *TypeMap;
|
|
|
|
ValueToValueMapTy ConvertedForStore;
|
|
|
|
IRBuilder<InstSimplifyFolder> IRB;
|
|
|
|
const TargetMachine *TM;
|
|
|
|
// Convert all the buffer fat pointers within the input value to integers
// so that the value can be stored in memory.
|
|
Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
|
|
// Convert all the i160s that need to be buffer fat pointers (as specified
// by the To type) into those pointers to preserve the semantics of the rest
// of the program.
|
|
Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
|
|
|
|
public:
|
|
StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
|
|
const DataLayout &DL,
|
|
LLVMContext &Ctx,
|
|
const TargetMachine *TM)
|
|
: TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {}
|
|
bool processFunction(Function &F);
|
|
|
|
bool visitInstruction(Instruction &I) { return false; }
|
|
bool visitAllocaInst(AllocaInst &I);
|
|
bool visitLoadInst(LoadInst &LI);
|
|
bool visitStoreInst(StoreInst &SI);
|
|
bool visitGetElementPtrInst(GetElementPtrInst &I);
|
|
|
|
bool visitMemCpyInst(MemCpyInst &MCI);
|
|
bool visitMemMoveInst(MemMoveInst &MMI);
|
|
bool visitMemSetInst(MemSetInst &MSI);
|
|
bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
|
|
};
|
|
} // namespace
|
|
|
|
Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
|
|
Value *V, Type *From, Type *To, const Twine &Name) {
|
|
if (From == To)
|
|
return V;
|
|
ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
|
|
if (Find != ConvertedForStore.end())
|
|
return Find->second;
|
|
if (isBufferFatPtrOrVector(From)) {
|
|
Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int");
|
|
ConvertedForStore[V] = Cast;
|
|
return Cast;
|
|
}
|
|
if (From->getNumContainedTypes() == 0)
|
|
return V;
|
|
// Structs, arrays, and other compound types.
|
|
Value *Ret = PoisonValue::get(To);
|
|
if (auto *AT = dyn_cast<ArrayType>(From)) {
|
|
Type *FromPart = AT->getArrayElementType();
|
|
Type *ToPart = cast<ArrayType>(To)->getElementType();
|
|
for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
|
|
Value *Field = IRB.CreateExtractValue(V, I);
|
|
Value *NewField =
|
|
fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I));
|
|
Ret = IRB.CreateInsertValue(Ret, NewField, I);
|
|
}
|
|
} else {
|
|
for (auto [Idx, FromPart, ToPart] :
|
|
enumerate(From->subtypes(), To->subtypes())) {
|
|
Value *Field = IRB.CreateExtractValue(V, Idx);
|
|
Value *NewField =
|
|
fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx));
|
|
Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
|
|
}
|
|
}
|
|
ConvertedForStore[V] = Ret;
|
|
return Ret;
|
|
}
|
|
|
|
Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
|
|
Value *V, Type *From, Type *To, const Twine &Name) {
|
|
if (From == To)
|
|
return V;
|
|
if (isBufferFatPtrOrVector(To)) {
|
|
Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr");
|
|
return Cast;
|
|
}
|
|
if (From->getNumContainedTypes() == 0)
|
|
return V;
|
|
// Structs, arrays, and other compound types.
|
|
Value *Ret = PoisonValue::get(To);
|
|
if (auto *AT = dyn_cast<ArrayType>(From)) {
|
|
Type *FromPart = AT->getArrayElementType();
|
|
Type *ToPart = cast<ArrayType>(To)->getElementType();
|
|
for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
|
|
Value *Field = IRB.CreateExtractValue(V, I);
|
|
Value *NewField =
|
|
intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I));
|
|
Ret = IRB.CreateInsertValue(Ret, NewField, I);
|
|
}
|
|
} else {
|
|
for (auto [Idx, FromPart, ToPart] :
|
|
enumerate(From->subtypes(), To->subtypes())) {
|
|
Value *Field = IRB.CreateExtractValue(V, Idx);
|
|
Value *NewField =
|
|
intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(Idx));
|
|
Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
|
|
}
|
|
}
|
|
return Ret;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
|
|
bool Changed = false;
|
|
// Process memcpy-like instructions after the main iteration because they can
|
|
// invalidate iterators.
|
|
SmallVector<WeakTrackingVH> CanBecomeLoops;
|
|
for (Instruction &I : make_early_inc_range(instructions(F))) {
|
|
if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
|
|
CanBecomeLoops.push_back(&I);
|
|
else
|
|
Changed |= visit(I);
|
|
}
|
|
for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
|
|
Changed |= visit(cast<Instruction>(VH));
|
|
}
|
|
ConvertedForStore.clear();
|
|
return Changed;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
|
|
Type *Ty = I.getAllocatedType();
|
|
Type *NewTy = TypeMap->remapType(Ty);
|
|
if (Ty == NewTy)
|
|
return false;
|
|
I.setAllocatedType(NewTy);
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
|
|
GetElementPtrInst &I) {
|
|
Type *Ty = I.getSourceElementType();
|
|
Type *NewTy = TypeMap->remapType(Ty);
|
|
if (Ty == NewTy)
|
|
return false;
|
|
// We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so
|
|
// make sure GEPs don't have different semantics with the new type.
|
|
I.setSourceElementType(NewTy);
|
|
I.setResultElementType(TypeMap->remapType(I.getResultElementType()));
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
|
|
Type *Ty = LI.getType();
|
|
Type *IntTy = TypeMap->remapType(Ty);
|
|
if (Ty == IntTy)
|
|
return false;
|
|
|
|
IRB.SetInsertPoint(&LI);
|
|
auto *NLI = cast<LoadInst>(LI.clone());
|
|
NLI->mutateType(IntTy);
|
|
NLI = IRB.Insert(NLI);
|
|
NLI->takeName(&LI);
|
|
|
|
Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
|
|
LI.replaceAllUsesWith(CastBack);
|
|
LI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
|
|
Value *V = SI.getValueOperand();
|
|
Type *Ty = V->getType();
|
|
Type *IntTy = TypeMap->remapType(Ty);
|
|
if (Ty == IntTy)
|
|
return false;
|
|
|
|
IRB.SetInsertPoint(&SI);
|
|
Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName());
|
|
for (auto *Dbg : at::getAssignmentMarkers(&SI))
|
|
Dbg->setValue(IntV);
|
|
|
|
SI.setOperand(0, IntV);
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
|
|
MemCpyInst &MCI) {
|
|
// TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
|
|
// need loop expansion here.
|
|
if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
|
|
MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
llvm::expandMemCpyAsLoop(&MCI,
|
|
TM->getTargetTransformInfo(*MCI.getFunction()));
|
|
MCI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
|
|
MemMoveInst &MMI) {
|
|
if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
|
|
MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
report_fatal_error(
|
|
"memmove() on buffer descriptors is not implemented because pointer "
|
|
"comparison on buffer descriptors isn't implemented\n");
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
|
|
MemSetInst &MSI) {
|
|
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
llvm::expandMemSetAsLoop(&MSI);
|
|
MSI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
|
|
MemSetPatternInst &MSPI) {
|
|
if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
llvm::expandMemSetPatternAsLoop(&MSPI);
|
|
MSPI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
namespace {
|
|
/// Convert loads/stores of types that the buffer intrinsics can't handle into
/// one or more such loads/stores that consist of legal types.
|
|
///
|
|
/// Do this by
|
|
/// 1. Recursing into structs (and arrays that don't share a memory layout with
|
|
/// vectors) since the intrinsics can't handle complex types.
|
|
/// 2. Converting arrays of non-aggregate, byte-sized types into their
|
|
/// corresponding vectors
|
|
/// 3. Bitcasting unsupported types, namely overly-long scalars and byte
|
|
/// vectors, into vectors of supported types.
|
|
/// 4. Splitting up excessively long reads/writes into multiple operations.
|
|
///
|
|
/// Note that this doesn't handle complex data structures, but, in the future,
|
|
/// the aggregate load splitter from SROA could be refactored to allow for that
|
|
/// case.
|
|
class LegalizeBufferContentTypesVisitor
|
|
: public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
|
|
friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
|
|
|
|
IRBuilder<InstSimplifyFolder> IRB;
|
|
|
|
const DataLayout &DL;
|
|
|
|
/// If T is [N x U], where U is a scalar type, return the vector type
|
|
/// <N x U>, otherwise, return T.
|
|
Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
|
|
Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name);
|
|
Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name);
|
|
|
|
/// Break up the loads of a struct into the loads of its components
|
|
|
|
/// Convert a vector or scalar type that can't be operated on by buffer
|
|
/// intrinsics to one that would be legal through bitcasts and/or truncation.
|
|
/// Uses the wider of i32, i16, or i8 where possible.
|
|
Type *legalNonAggregateFor(Type *T);
|
|
Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
|
|
Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
|
|
|
|
struct VecSlice {
|
|
uint64_t Index = 0;
|
|
uint64_t Length = 0;
|
|
VecSlice() = delete;
|
|
// Needed for some Clangs
|
|
VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {}
|
|
};
|
|
/// Return the [index, length] pairs into which `T` needs to be cut to form
|
|
/// legal buffer load or store operations. Clears `Slices`. Creates an empty
|
|
/// `Slices` for non-vector inputs and creates one slice if no slicing will be
|
|
/// needed.
|
|
void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
|
|
|
|
Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
|
|
Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
|
|
|
|
/// In most cases, return `LegalType`. However, when given an input that would
|
|
/// normally be a legal type for the buffer intrinsics to return but that
|
|
/// isn't hooked up through SelectionDAG, return a type of the same width that
|
|
/// can be used with the relevant intrinsics. Specifically, handle the cases:
|
|
/// - <1 x T> => T for all T
|
|
/// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
|
|
/// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
|
|
/// i32>
|
|
Type *intrinsicTypeFor(Type *LegalType);
|
|
|
|
bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
|
|
SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
|
|
Value *&Result, const Twine &Name);
|
|
/// Return value is (Changed, ModifiedInPlace)
|
|
std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
|
|
SmallVectorImpl<uint32_t> &AggIdxs,
|
|
uint64_t AggByteOffset,
|
|
const Twine &Name);
|
|
|
|
bool visitInstruction(Instruction &I) { return false; }
|
|
bool visitLoadInst(LoadInst &LI);
|
|
bool visitStoreInst(StoreInst &SI);
|
|
|
|
public:
|
|
LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
|
|
: IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
|
|
bool processFunction(Function &F);
|
|
};
|
|
} // namespace
|
|
|
|
Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
|
|
ArrayType *AT = dyn_cast<ArrayType>(T);
|
|
if (!AT)
|
|
return T;
|
|
Type *ET = AT->getElementType();
|
|
if (!ET->isSingleValueType() || isa<VectorType>(ET))
|
|
report_fatal_error("loading non-scalar arrays from buffer fat pointers "
|
|
"should have recursed");
|
|
if (!DL.typeSizeEqualsStoreSize(AT))
|
|
report_fatal_error(
|
|
"loading padded arrays from buffer fat pinters should have recursed");
|
|
return FixedVectorType::get(ET, AT->getNumElements());
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
|
|
Type *TargetType,
|
|
const Twine &Name) {
|
|
Value *VectorRes = PoisonValue::get(TargetType);
|
|
auto *VT = cast<FixedVectorType>(TargetType);
|
|
unsigned EC = VT->getNumElements();
|
|
for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
|
|
Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
|
|
VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
|
|
Name + ".as.vec." + Twine(I));
|
|
}
|
|
return VectorRes;
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
|
|
Type *OrigType,
|
|
const Twine &Name) {
|
|
Value *ArrayRes = PoisonValue::get(OrigType);
|
|
ArrayType *AT = cast<ArrayType>(OrigType);
|
|
unsigned EC = AT->getNumElements();
|
|
for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
|
|
Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I));
|
|
ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I,
|
|
Name + ".as.array." + Twine(I));
|
|
}
|
|
return ArrayRes;
|
|
}
|
|
|
|
Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
|
|
TypeSize Size = DL.getTypeStoreSizeInBits(T);
|
|
// Implicitly zero-extend to the next byte if needed
|
|
if (!DL.typeSizeEqualsStoreSize(T))
|
|
T = IRB.getIntNTy(Size.getFixedValue());
|
|
Type *ElemTy = T->getScalarType();
|
|
if (isa<PointerType, ScalableVectorType>(ElemTy)) {
|
|
// Pointers are always big enough, and we'll let scalable vectors through to
|
|
// fail in codegen.
|
|
return T;
|
|
}
|
|
unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
|
|
if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
|
|
// [vectors of] anything that's 16/32/64/128 bits can be cast and split into
|
|
// legal buffer operations.
|
|
return T;
|
|
}
|
|
Type *BestVectorElemType = nullptr;
|
|
if (Size.isKnownMultipleOf(32))
|
|
BestVectorElemType = IRB.getInt32Ty();
|
|
else if (Size.isKnownMultipleOf(16))
|
|
BestVectorElemType = IRB.getInt16Ty();
|
|
else
|
|
BestVectorElemType = IRB.getInt8Ty();
|
|
unsigned NumCastElems =
|
|
Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth();
|
|
if (NumCastElems == 1)
|
|
return BestVectorElemType;
|
|
return FixedVectorType::get(BestVectorElemType, NumCastElems);
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
|
|
Value *V, Type *TargetType, const Twine &Name) {
|
|
Type *SourceType = V->getType();
|
|
TypeSize SourceSize = DL.getTypeSizeInBits(SourceType);
|
|
TypeSize TargetSize = DL.getTypeSizeInBits(TargetType);
|
|
if (SourceSize != TargetSize) {
|
|
Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue());
|
|
Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue());
|
|
Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
|
|
Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
|
|
V = Zext;
|
|
SourceType = ByteScalarTy;
|
|
}
|
|
return IRB.CreateBitCast(V, TargetType, Name + ".legal");
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
|
|
Value *V, Type *OrigType, const Twine &Name) {
|
|
Type *LegalType = V->getType();
|
|
TypeSize LegalSize = DL.getTypeSizeInBits(LegalType);
|
|
TypeSize OrigSize = DL.getTypeSizeInBits(OrigType);
|
|
if (LegalSize != OrigSize) {
|
|
Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue());
|
|
Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue());
|
|
Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
|
|
Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
|
|
return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
|
|
}
|
|
return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
|
|
}
|
|
|
|
Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
|
|
auto *VT = dyn_cast<FixedVectorType>(LegalType);
|
|
if (!VT)
|
|
return LegalType;
|
|
Type *ET = VT->getElementType();
|
|
// Explicitly return the element type of 1-element vectors because the
|
|
// underlying intrinsics don't like <1 x T> even though it's a synonym for T.
|
|
if (VT->getNumElements() == 1)
|
|
return ET;
|
|
if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
|
|
return FixedVectorType::get(IRB.getInt32Ty(), 3);
|
|
if (ET->isIntegerTy(8)) {
|
|
switch (VT->getNumElements()) {
|
|
default:
|
|
return LegalType; // Let it crash later
|
|
case 1:
|
|
return IRB.getInt8Ty();
|
|
case 2:
|
|
return IRB.getInt16Ty();
|
|
case 4:
|
|
return IRB.getInt32Ty();
|
|
case 8:
|
|
return FixedVectorType::get(IRB.getInt32Ty(), 2);
|
|
case 16:
|
|
return FixedVectorType::get(IRB.getInt32Ty(), 4);
|
|
}
|
|
}
|
|
return LegalType;
|
|
}
|
|
|
|
void LegalizeBufferContentTypesVisitor::getVecSlices(
|
|
Type *T, SmallVectorImpl<VecSlice> &Slices) {
|
|
Slices.clear();
|
|
auto *VT = dyn_cast<FixedVectorType>(T);
|
|
if (!VT)
|
|
return;
|
|
|
|
uint64_t ElemBitWidth =
|
|
DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
|
|
|
|
uint64_t ElemsPer4Words = 128 / ElemBitWidth;
|
|
uint64_t ElemsPer2Words = ElemsPer4Words / 2;
|
|
uint64_t ElemsPerWord = ElemsPer2Words / 2;
|
|
uint64_t ElemsPerShort = ElemsPerWord / 2;
|
|
uint64_t ElemsPerByte = ElemsPerShort / 2;
|
|
// If the elements evenly pack into 32-bit words, we can use 3-word stores,
|
|
// such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for
|
|
// example, <3 x i64>, since that's not slicing.
|
|
uint64_t ElemsPer3Words = ElemsPerWord * 3;
|
|
|
|
uint64_t TotalElems = VT->getNumElements();
|
|
uint64_t Index = 0;
|
|
auto TrySlice = [&](unsigned MaybeLen) {
|
|
if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
|
|
VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
|
|
Slices.push_back(Slice);
|
|
Index += MaybeLen;
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
while (Index < TotalElems) {
|
|
TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
|
|
TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
|
|
TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
|
|
}
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
|
|
const Twine &Name) {
|
|
auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType());
|
|
if (!VecVT)
|
|
return Vec;
|
|
if (S.Length == VecVT->getNumElements() && S.Index == 0)
|
|
return Vec;
|
|
if (S.Length == 1)
|
|
return IRB.CreateExtractElement(Vec, S.Index,
|
|
Name + ".slice." + Twine(S.Index));
|
|
SmallVector<int> Mask = llvm::to_vector(
|
|
llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false));
|
|
return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index));
|
|
}
|
|
|
|
Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
|
|
VecSlice S,
|
|
const Twine &Name) {
|
|
auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType());
|
|
if (!WholeVT)
|
|
return Part;
|
|
if (S.Length == WholeVT->getNumElements() && S.Index == 0)
|
|
return Part;
|
|
if (S.Length == 1) {
|
|
return IRB.CreateInsertElement(Whole, Part, S.Index,
|
|
Name + ".slice." + Twine(S.Index));
|
|
}
|
|
int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
|
|
|
|
// Extend the slice with poisons to make the main shufflevector happy.
|
|
SmallVector<int> ExtPartMask(NumElems, -1);
|
|
for (auto [I, E] : llvm::enumerate(
|
|
MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
|
|
E = I;
|
|
}
|
|
Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
|
|
Name + ".ext." + Twine(S.Index));
|
|
|
|
SmallVector<int> Mask =
|
|
llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
|
|
for (auto [I, E] :
|
|
llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length)))
|
|
E = I + NumElems;
|
|
return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
|
|
Name + ".parts." + Twine(S.Index));
|
|
}
|
|
|
|
bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
|
|
LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
|
|
uint64_t AggByteOff, Value *&Result, const Twine &Name) {
|
|
if (auto *ST = dyn_cast<StructType>(PartType)) {
|
|
const StructLayout *Layout = DL.getStructLayout(ST);
|
|
bool Changed = false;
|
|
for (auto [I, ElemTy, Offset] :
|
|
llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
|
|
AggIdxs.push_back(I);
|
|
Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
|
|
AggByteOff + Offset.getFixedValue(), Result,
|
|
Name + "." + Twine(I));
|
|
AggIdxs.pop_back();
|
|
}
|
|
return Changed;
|
|
}
|
|
if (auto *AT = dyn_cast<ArrayType>(PartType)) {
|
|
Type *ElemTy = AT->getElementType();
|
|
if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
|
|
ElemTy->isVectorTy()) {
|
|
TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
|
|
bool Changed = false;
|
|
for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
|
|
/*Inclusive=*/false)) {
|
|
AggIdxs.push_back(I);
|
|
Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
|
|
AggByteOff + I * ElemStoreSize.getFixedValue(),
|
|
Result, Name + Twine(I));
|
|
AggIdxs.pop_back();
|
|
}
|
|
return Changed;
|
|
}
|
|
}
|
|
|
|
// Typical case
|
|
|
|
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
|
|
Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
|
|
|
|
SmallVector<VecSlice> Slices;
|
|
getVecSlices(LegalType, Slices);
|
|
bool HasSlices = Slices.size() > 1;
|
|
bool IsAggPart = !AggIdxs.empty();
|
|
Value *LoadsRes;
|
|
if (!HasSlices && !IsAggPart) {
|
|
Type *LoadableType = intrinsicTypeFor(LegalType);
|
|
if (LoadableType == PartType)
|
|
return false;
|
|
|
|
IRB.SetInsertPoint(&OrigLI);
|
|
auto *NLI = cast<LoadInst>(OrigLI.clone());
|
|
NLI->mutateType(LoadableType);
|
|
NLI = IRB.Insert(NLI);
|
|
NLI->setName(Name + ".loadable");
|
|
|
|
LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable");
|
|
} else {
|
|
IRB.SetInsertPoint(&OrigLI);
|
|
LoadsRes = PoisonValue::get(LegalType);
|
|
Value *OrigPtr = OrigLI.getPointerOperand();
|
|
// If we need to split something into more than one load, its legal
|
|
// type will be a vector (ex. an i256 load will have LegalType = <8 x i32>).
|
|
// But if we're already a scalar (which can happen if we're splitting up a
|
|
// struct), the element type will be the legal type itself.
|
|
Type *ElemType = LegalType->getScalarType();
|
|
unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
|
|
AAMDNodes AANodes = OrigLI.getAAMetadata();
|
|
if (IsAggPart && Slices.empty())
|
|
Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
|
|
for (VecSlice S : Slices) {
|
|
Type *SliceType =
|
|
S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
|
|
int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
|
|
// You can't reasonably expect loads to wrap around the edge of memory.
|
|
Value *NewPtr = IRB.CreateGEP(
|
|
IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
|
|
OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
|
|
GEPNoWrapFlags::noUnsignedWrap());
|
|
Type *LoadableType = intrinsicTypeFor(SliceType);
|
|
LoadInst *NewLI = IRB.CreateAlignedLoad(
|
|
LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
|
|
Name + ".off." + Twine(ByteOffset));
|
|
copyMetadataForLoad(*NewLI, OrigLI);
|
|
NewLI->setAAMetadata(
|
|
AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
|
|
NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
|
|
NewLI->setVolatile(OrigLI.isVolatile());
|
|
Value *Loaded = IRB.CreateBitCast(NewLI, SliceType,
|
|
NewLI->getName() + ".from.loadable");
|
|
LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
|
|
}
|
|
}
|
|
if (LegalType != ArrayAsVecType)
|
|
LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
|
|
if (ArrayAsVecType != PartType)
|
|
LoadsRes = vectorToArray(LoadsRes, PartType, Name);
|
|
|
|
if (IsAggPart)
|
|
Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name);
|
|
else
|
|
Result = LoadsRes;
|
|
return true;
|
|
}
|
|
|
|
bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
|
|
if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
|
|
SmallVector<uint32_t> AggIdxs;
|
|
Type *OrigType = LI.getType();
|
|
Value *Result = PoisonValue::get(OrigType);
|
|
bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName());
|
|
if (!Changed)
|
|
return false;
|
|
Result->takeName(&LI);
|
|
LI.replaceAllUsesWith(Result);
|
|
LI.eraseFromParent();
|
|
return Changed;
|
|
}
|
|
|
|
std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
|
|
StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
|
|
uint64_t AggByteOff, const Twine &Name) {
|
|
if (auto *ST = dyn_cast<StructType>(PartType)) {
|
|
const StructLayout *Layout = DL.getStructLayout(ST);
|
|
bool Changed = false;
|
|
for (auto [I, ElemTy, Offset] :
|
|
llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
|
|
AggIdxs.push_back(I);
|
|
Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs,
|
|
AggByteOff + Offset.getFixedValue(),
|
|
Name + "." + Twine(I)));
|
|
AggIdxs.pop_back();
|
|
}
|
|
return std::make_pair(Changed, /*ModifiedInPlace=*/false);
|
|
}
|
|
if (auto *AT = dyn_cast<ArrayType>(PartType)) {
|
|
Type *ElemTy = AT->getElementType();
|
|
if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
|
|
ElemTy->isVectorTy()) {
|
|
TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
|
|
bool Changed = false;
|
|
for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
|
|
/*Inclusive=*/false)) {
|
|
AggIdxs.push_back(I);
|
|
Changed |= std::get<0>(visitStoreImpl(
|
|
OrigSI, ElemTy, AggIdxs,
|
|
AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I)));
|
|
AggIdxs.pop_back();
|
|
}
|
|
return std::make_pair(Changed, /*ModifiedInPlace=*/false);
|
|
}
|
|
}
|
|
|
|
Value *OrigData = OrigSI.getValueOperand();
|
|
Value *NewData = OrigData;
|
|
|
|
bool IsAggPart = !AggIdxs.empty();
|
|
if (IsAggPart)
|
|
NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name);
|
|
|
|
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
|
|
if (ArrayAsVecType != PartType) {
|
|
NewData = arrayToVector(NewData, ArrayAsVecType, Name);
|
|
}
|
|
|
|
Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
|
|
if (LegalType != ArrayAsVecType) {
|
|
NewData = makeLegalNonAggregate(NewData, LegalType, Name);
|
|
}
|
|
|
|
SmallVector<VecSlice> Slices;
|
|
getVecSlices(LegalType, Slices);
|
|
bool NeedToSplit = Slices.size() > 1 || IsAggPart;
|
|
if (!NeedToSplit) {
|
|
Type *StorableType = intrinsicTypeFor(LegalType);
|
|
if (StorableType == PartType)
|
|
return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false);
|
|
NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
|
|
OrigSI.setOperand(0, NewData);
|
|
return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true);
|
|
}
|
|
|
|
Value *OrigPtr = OrigSI.getPointerOperand();
|
|
Type *ElemType = LegalType->getScalarType();
|
|
if (IsAggPart && Slices.empty())
|
|
Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
|
|
unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
|
|
AAMDNodes AANodes = OrigSI.getAAMetadata();
|
|
for (VecSlice S : Slices) {
|
|
Type *SliceType =
|
|
S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
|
|
int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
|
|
Value *NewPtr =
|
|
IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
|
|
OrigPtr->getName() + ".part." + Twine(S.Index),
|
|
GEPNoWrapFlags::noUnsignedWrap());
|
|
Value *DataSlice = extractSlice(NewData, S, Name);
|
|
Type *StorableType = intrinsicTypeFor(SliceType);
|
|
DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
|
|
DataSlice->getName() + ".storable");
|
|
auto *NewSI = cast<StoreInst>(OrigSI.clone());
|
|
NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset));
|
|
IRB.Insert(NewSI);
|
|
NewSI->setOperand(0, DataSlice);
|
|
NewSI->setOperand(1, NewPtr);
|
|
NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
|
|
}
|
|
return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false);
|
|
}
|
|
|
|
bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
|
|
if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
|
|
return false;
|
|
IRB.SetInsertPoint(&SI);
|
|
SmallVector<uint32_t> AggIdxs;
|
|
Value *OrigData = SI.getValueOperand();
|
|
auto [Changed, ModifiedInPlace] =
|
|
visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName());
|
|
if (Changed && !ModifiedInPlace)
|
|
SI.eraseFromParent();
|
|
return Changed;
|
|
}
|
|
|
|
bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
|
|
bool Changed = false;
|
|
// Note: memory transfer intrinsics won't appear here, since any that touch
// buffer fat pointers were already expanded into loops by the earlier phase.
|
|
for (Instruction &I : make_early_inc_range(instructions(F))) {
|
|
Changed |= visit(I);
|
|
}
|
|
return Changed;
|
|
}
|
|
|
|
/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
|
|
/// buffer fat pointer constant.
|
|
static std::pair<Constant *, Constant *>
|
|
splitLoweredFatBufferConst(Constant *C) {
|
|
assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer");
|
|
return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u));
|
|
}
|
|
|
|
namespace {
|
|
/// Handle the remapping of ptr addrspace(7) constants.
|
|
class FatPtrConstMaterializer final : public ValueMaterializer {
|
|
BufferFatPtrToStructTypeMap *TypeMap;
|
|
// An internal mapper that is used to recurse into the arguments of constants.
|
|
// While the documentation for `ValueMapper` specifies not to use it
|
|
// recursively, examination of the logic in mapValue() shows that it can
|
|
// safely be used recursively when handling constants, like it does in its own
|
|
// logic.
|
|
ValueMapper InternalMapper;
|
|
|
|
Constant *materializeBufferFatPtrConst(Constant *C);
|
|
|
|
public:
|
|
// UnderlyingMap is the value map this materializer will be filling.
|
|
FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
|
|
ValueToValueMapTy &UnderlyingMap)
|
|
: TypeMap(TypeMap),
|
|
InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {}
|
|
virtual ~FatPtrConstMaterializer() = default;
|
|
|
|
Value *materialize(Value *V) override;
|
|
};
|
|
} // namespace
|
|
|
|
Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
|
|
Type *SrcTy = C->getType();
|
|
auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy));
|
|
if (C->isNullValue())
|
|
return ConstantAggregateZero::getNullValue(NewTy);
|
|
if (isa<PoisonValue>(C)) {
|
|
return ConstantStruct::get(NewTy,
|
|
{PoisonValue::get(NewTy->getElementType(0)),
|
|
PoisonValue::get(NewTy->getElementType(1))});
|
|
}
|
|
if (isa<UndefValue>(C)) {
|
|
return ConstantStruct::get(NewTy,
|
|
{UndefValue::get(NewTy->getElementType(0)),
|
|
UndefValue::get(NewTy->getElementType(1))});
|
|
}
|
|
|
|
if (auto *VC = dyn_cast<ConstantVector>(C)) {
|
|
if (Constant *S = VC->getSplatValue()) {
|
|
Constant *NewS = InternalMapper.mapConstant(*S);
|
|
if (!NewS)
|
|
return nullptr;
|
|
auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS);
|
|
auto EC = VC->getType()->getElementCount();
|
|
return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc),
|
|
ConstantVector::getSplat(EC, Off)});
|
|
}
|
|
SmallVector<Constant *> Rsrcs;
|
|
SmallVector<Constant *> Offs;
|
|
for (Value *Op : VC->operand_values()) {
|
|
auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op));
|
|
if (!NewOp)
|
|
return nullptr;
|
|
auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp);
|
|
Rsrcs.push_back(Rsrc);
|
|
Offs.push_back(Off);
|
|
}
|
|
Constant *RsrcVec = ConstantVector::get(Rsrcs);
|
|
Constant *OffVec = ConstantVector::get(Offs);
|
|
return ConstantStruct::get(NewTy, {RsrcVec, OffVec});
|
|
}
|
|
|
|
if (isa<GlobalValue>(C))
|
|
report_fatal_error("Global values containing ptr addrspace(7) (buffer "
|
|
"fat pointer) values are not supported");
|
|
|
|
if (isa<ConstantExpr>(C))
|
|
report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer "
|
|
"fat pointer) values should have been expanded earlier");
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
Value *FatPtrConstMaterializer::materialize(Value *V) {
|
|
Constant *C = dyn_cast<Constant>(V);
|
|
if (!C)
|
|
return nullptr;
|
|
// Structs and other types that happen to contain fat pointers get remapped
|
|
// by the mapValue() logic.
|
|
if (!isBufferFatPtrConst(C))
|
|
return nullptr;
|
|
return materializeBufferFatPtrConst(C);
|
|
}
|
|
|
|
using PtrParts = std::pair<Value *, Value *>;
|
|
namespace {
|
|
// The visitor returns the resource and offset parts for an instruction if they
|
|
// can be computed, or (nullptr, nullptr) for cases that don't have a meaningful
|
|
// value mapping.
|
|
class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
|
|
ValueToValueMapTy RsrcParts;
|
|
ValueToValueMapTy OffParts;
|
|
|
|
// Track instructions that have been rewritten into a user of the component
|
|
// parts of their ptr addrspace(7) input. Instructions that produced
|
|
// ptr addrspace(7) parts should **not** be RAUW'd before being added to this
|
|
// set, as that replacement will be handled in a post-visit step. However,
|
|
// instructions that yield values that aren't fat pointers (ex. ptrtoint)
|
|
// should RAUW themselves with new instructions that use the split parts
|
|
// of their arguments during processing.
|
|
DenseSet<Instruction *> SplitUsers;
|
|
|
|
// Nodes that need a second look once we've computed the parts for all other
|
|
// instructions to see if, for example, we really need to phi on the resource
|
|
// part.
|
|
SmallVector<Instruction *> Conditionals;
|
|
// Temporary instructions produced while lowering conditionals that should be
|
|
// killed.
|
|
SmallVector<Instruction *> ConditionalTemps;
|
|
|
|
// Subtarget info, needed for determining what cache control bits to set.
|
|
const TargetMachine *TM;
|
|
const GCNSubtarget *ST = nullptr;
|
|
|
|
IRBuilder<InstSimplifyFolder> IRB;
|
|
|
|
// Copy metadata between instructions if applicable.
|
|
void copyMetadata(Value *Dest, Value *Src);
|
|
|
|
// Get the resource and offset parts of the value V, inserting appropriate
|
|
// extractvalue calls if needed.
|
|
PtrParts getPtrParts(Value *V);
|
|
|
|
// Given an instruction that could produce multiple resource parts (a PHI or
// select), collect the set of values that could have provided its resource
// part (the `Roots`) and the set of
|
|
// conditional instructions visited during the search (`Seen`). If, after
|
|
// removing the root of the search from `Seen` and `Roots`, `Seen` is a subset
|
|
// of `Roots` and `Roots - Seen` contains one element, the resource part of
|
|
// that element can replace the resource part of all other elements in `Seen`.
|
|
void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots,
|
|
SmallPtrSetImpl<Value *> &Seen);
|
|
void processConditionals();
|
|
|
|
// If an instruction has been split into resource and offset parts,
|
|
// delete that instruction. If any of its uses have not themselves been split
|
|
// into parts (for example, an insertvalue), construct the structure
|
|
// that the type rewrites declared should be produced by the dying instruction
|
|
// and use that.
|
|
// Also, kill the temporary extractvalue operations produced by the two-stage
|
|
// lowering of PHIs and conditionals.
|
|
void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs);
|
|
|
|
void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx);
|
|
void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
|
|
void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
|
|
Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty,
|
|
Align Alignment, AtomicOrdering Order,
|
|
bool IsVolatile, SyncScope::ID SSID);
|
|
|
|
public:
|
|
SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx,
|
|
const TargetMachine *TM)
|
|
: TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {}
|
|
|
|
void processFunction(Function &F);
|
|
|
|
PtrParts visitInstruction(Instruction &I);
|
|
PtrParts visitLoadInst(LoadInst &LI);
|
|
PtrParts visitStoreInst(StoreInst &SI);
|
|
PtrParts visitAtomicRMWInst(AtomicRMWInst &AI);
|
|
PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
|
|
PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);
|
|
|
|
PtrParts visitPtrToIntInst(PtrToIntInst &PI);
|
|
PtrParts visitIntToPtrInst(IntToPtrInst &IP);
|
|
PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
|
|
PtrParts visitICmpInst(ICmpInst &Cmp);
|
|
PtrParts visitFreezeInst(FreezeInst &I);
|
|
|
|
PtrParts visitExtractElementInst(ExtractElementInst &I);
|
|
PtrParts visitInsertElementInst(InsertElementInst &I);
|
|
PtrParts visitShuffleVectorInst(ShuffleVectorInst &I);
|
|
|
|
PtrParts visitPHINode(PHINode &PHI);
|
|
PtrParts visitSelectInst(SelectInst &SI);
|
|
|
|
PtrParts visitIntrinsicInst(IntrinsicInst &II);
|
|
};
|
|
} // namespace
|
|
|
|
void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) {
|
|
auto *DestI = dyn_cast<Instruction>(Dest);
|
|
auto *SrcI = dyn_cast<Instruction>(Src);
|
|
|
|
if (!DestI || !SrcI)
|
|
return;
|
|
|
|
DestI->copyMetadata(*SrcI);
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::getPtrParts(Value *V) {
|
|
assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts "
|
|
"of something that wasn't rewritten");
|
|
auto *RsrcEntry = &RsrcParts[V];
|
|
auto *OffEntry = &OffParts[V];
|
|
if (*RsrcEntry && *OffEntry)
|
|
return {*RsrcEntry, *OffEntry};
|
|
|
|
if (auto *C = dyn_cast<Constant>(V)) {
|
|
auto [Rsrc, Off] = splitLoweredFatBufferConst(C);
|
|
return {*RsrcEntry = Rsrc, *OffEntry = Off};
|
|
}
|
|
|
|
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
|
|
if (auto *I = dyn_cast<Instruction>(V)) {
|
|
LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
|
|
auto [Rsrc, Off] = visit(*I);
|
|
if (Rsrc && Off)
|
|
return {*RsrcEntry = Rsrc, *OffEntry = Off};
|
|
// We'll be creating the new values after the relevant instruction.
|
|
// This instruction generates a value and so isn't a terminator.
|
|
IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
|
|
IRB.SetCurrentDebugLocation(I->getDebugLoc());
|
|
} else if (auto *A = dyn_cast<Argument>(V)) {
|
|
IRB.SetInsertPointPastAllocas(A->getParent());
|
|
IRB.SetCurrentDebugLocation(DebugLoc());
|
|
}
|
|
Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc");
|
|
Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off");
|
|
return {*RsrcEntry = Rsrc, *OffEntry = Off};
|
|
}
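// For illustration (hypothetical value names): given a rewritten fat pointer
// `%p` of type {ptr addrspace(8), i32}, getPtrParts(%p) returns the cached
// parts if they exist and otherwise materializes
//   %p.rsrc = extractvalue {ptr addrspace(8), i32} %p, 0
//   %p.off = extractvalue {ptr addrspace(8), i32} %p, 1
// right after %p's definition (or past the entry-block allocas when %p is a
// function argument).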
|
|
|
|
/// Returns the instruction that defines the resource part of the value V.
|
|
/// Note that this is not getUnderlyingObject(), since that looks through
|
|
/// operations like ptrmask which might modify the resource part.
|
|
///
|
|
/// We can limit ourselves to just looking through GEPs followed by looking
|
|
/// through addrspacecasts because only those two operations preserve the
|
|
/// resource part, and because operations on an `addrspace(8)` pointer (the
/// legal input to such an addrspacecast) would produce a different resource
/// part.
|
|
static Value *rsrcPartRoot(Value *V) {
|
|
while (auto *GEP = dyn_cast<GEPOperator>(V))
|
|
V = GEP->getPointerOperand();
|
|
while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V))
|
|
V = ASC->getPointerOperand();
|
|
return V;
|
|
}
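// Conceptual sketch (pre-lowering notation, hypothetical value names): for
//   %p = addrspacecast ptr addrspace(8) %r to ptr addrspace(7)
//   %q = getelementptr i8, ptr addrspace(7) %p, i32 16
// rsrcPartRoot(%q) looks through the GEP and then the addrspacecast and
// returns %r, since neither operation can change the resource part.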
|
|
|
|
void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
|
|
SmallPtrSetImpl<Value *> &Roots,
|
|
SmallPtrSetImpl<Value *> &Seen) {
|
|
if (auto *PHI = dyn_cast<PHINode>(I)) {
|
|
if (!Seen.insert(I).second)
|
|
return;
|
|
for (Value *In : PHI->incoming_values()) {
|
|
In = rsrcPartRoot(In);
|
|
Roots.insert(In);
|
|
if (isa<PHINode, SelectInst>(In))
|
|
getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen);
|
|
}
|
|
} else if (auto *SI = dyn_cast<SelectInst>(I)) {
|
|
if (!Seen.insert(SI).second)
|
|
return;
|
|
Value *TrueVal = rsrcPartRoot(SI->getTrueValue());
|
|
Value *FalseVal = rsrcPartRoot(SI->getFalseValue());
|
|
Roots.insert(TrueVal);
|
|
Roots.insert(FalseVal);
|
|
if (isa<PHINode, SelectInst>(TrueVal))
|
|
getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen);
|
|
if (isa<PHINode, SelectInst>(FalseVal))
|
|
getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen);
|
|
} else {
|
|
llvm_unreachable("getPossibleRsrcParts() only works on phi and select");
|
|
}
|
|
}
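// Example of the root search above (pre-lowering notation, hypothetical value
// names): for a loop that only advances the offset,
//   %p = phi ptr addrspace(7) [ %init, %entry ], [ %p.next, %loop ]
//   %p.next = getelementptr i8, ptr addrspace(7) %p, i32 4
// with %init defined elsewhere (say, a function argument), the search gives
// Roots = {%init, %p} and Seen = {%p}. After the caller erases the phi itself
// from both sets, `Roots - Seen` is {%init}, so %init's resource part can be
// reused instead of creating a resource phi.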
|
|
|
|
void SplitPtrStructs::processConditionals() {
|
|
SmallDenseMap<Value *, Value *> FoundRsrcs;
|
|
SmallPtrSet<Value *, 4> Roots;
|
|
SmallPtrSet<Value *, 4> Seen;
|
|
for (Instruction *I : Conditionals) {
|
|
// These have to exist by now because we've visited these nodes.
|
|
Value *Rsrc = RsrcParts[I];
|
|
Value *Off = OffParts[I];
|
|
assert(Rsrc && Off && "must have visited conditionals by now");
|
|
|
|
std::optional<Value *> MaybeRsrc;
|
|
auto MaybeFoundRsrc = FoundRsrcs.find(I);
|
|
if (MaybeFoundRsrc != FoundRsrcs.end()) {
|
|
MaybeRsrc = MaybeFoundRsrc->second;
|
|
} else {
|
|
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
|
|
Roots.clear();
|
|
Seen.clear();
|
|
getPossibleRsrcRoots(I, Roots, Seen);
|
|
LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n");
|
|
#ifndef NDEBUG
|
|
for (Value *V : Roots)
|
|
LLVM_DEBUG(dbgs() << "Root: " << *V << "\n");
|
|
for (Value *V : Seen)
|
|
LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n");
|
|
#endif
|
|
// If we are our own possible root, then we shouldn't block our
|
|
// replacement with a valid incoming value.
|
|
Roots.erase(I);
|
|
// We don't want to block the optimization for conditionals that don't
|
|
// refer to themselves but did see themselves during the traversal.
|
|
Seen.erase(I);
|
|
|
|
if (set_is_subset(Seen, Roots)) {
|
|
auto Diff = set_difference(Roots, Seen);
|
|
if (Diff.size() == 1) {
|
|
Value *RootVal = *Diff.begin();
|
|
// Handle the case where previous loops already looked through
|
|
// an addrspacecast.
|
|
if (isSplitFatPtr(RootVal->getType()))
|
|
MaybeRsrc = std::get<0>(getPtrParts(RootVal));
|
|
else
|
|
MaybeRsrc = RootVal;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (auto *PHI = dyn_cast<PHINode>(I)) {
|
|
Value *NewRsrc;
|
|
StructType *PHITy = cast<StructType>(PHI->getType());
|
|
IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef());
|
|
IRB.SetCurrentDebugLocation(PHI->getDebugLoc());
|
|
if (MaybeRsrc) {
|
|
NewRsrc = *MaybeRsrc;
|
|
} else {
|
|
Type *RsrcTy = PHITy->getElementType(0);
|
|
auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues());
|
|
RsrcPHI->takeName(Rsrc);
|
|
for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
|
|
Value *VRsrc = std::get<0>(getPtrParts(V));
|
|
RsrcPHI->addIncoming(VRsrc, BB);
|
|
}
|
|
copyMetadata(RsrcPHI, PHI);
|
|
NewRsrc = RsrcPHI;
|
|
}
|
|
|
|
Type *OffTy = PHITy->getElementType(1);
|
|
auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues());
|
|
NewOff->takeName(Off);
|
|
for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
|
|
assert(OffParts.count(V) && "An offset part had to be created by now");
|
|
Value *VOff = std::get<1>(getPtrParts(V));
|
|
NewOff->addIncoming(VOff, BB);
|
|
}
|
|
copyMetadata(NewOff, PHI);
|
|
|
|
// Note: We don't eraseFromParent() the temporaries because we don't want
|
|
// to put the correction maps in an inconsistent state. That'll be handled
|
|
// during the rest of the killing. Also, `ValueToValueMapTy` guarantees
|
|
// that references in that map will be updated as well.
|
|
// Note that if the temporary instruction got `InstSimplify`'d away, it
|
|
// might be something like a block argument.
|
|
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
|
|
ConditionalTemps.push_back(RsrcInst);
|
|
RsrcInst->replaceAllUsesWith(NewRsrc);
|
|
}
|
|
if (auto *OffInst = dyn_cast<Instruction>(Off)) {
|
|
ConditionalTemps.push_back(OffInst);
|
|
OffInst->replaceAllUsesWith(NewOff);
|
|
}
|
|
|
|
// Save on recomputing the cycle traversals in known-root cases.
|
|
if (MaybeRsrc)
|
|
for (Value *V : Seen)
|
|
FoundRsrcs[V] = NewRsrc;
|
|
} else if (isa<SelectInst>(I)) {
|
|
if (MaybeRsrc) {
|
|
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
|
|
ConditionalTemps.push_back(RsrcInst);
|
|
RsrcInst->replaceAllUsesWith(*MaybeRsrc);
|
|
}
|
|
for (Value *V : Seen)
|
|
FoundRsrcs[V] = *MaybeRsrc;
|
|
}
|
|
} else {
|
|
llvm_unreachable("Only PHIs and selects go in the conditionals list");
|
|
}
|
|
}
|
|
}
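// Net effect, sketched with hypothetical value names: when no single root is
// found, a split phi
//   %p = phi {ptr addrspace(8), i32} [ %a, %bb1 ], [ %b, %bb2 ]
// is given real per-part phis
//   %p.rsrc = phi ptr addrspace(8) [ %a.rsrc, %bb1 ], [ %b.rsrc, %bb2 ]
//   %p.off = phi i32 [ %a.off, %bb1 ], [ %b.off, %bb2 ]
// while a known common root lets %p.rsrc be replaced by that root's resource
// part, leaving only the offset phi.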
|
|
|
|
void SplitPtrStructs::killAndReplaceSplitInstructions(
|
|
SmallVectorImpl<Instruction *> &Origs) {
|
|
for (Instruction *I : ConditionalTemps)
|
|
I->eraseFromParent();
|
|
|
|
for (Instruction *I : Origs) {
|
|
if (!SplitUsers.contains(I))
|
|
continue;
|
|
|
|
SmallVector<DbgValueInst *> Dbgs;
|
|
findDbgValues(Dbgs, I);
|
|
for (auto *Dbg : Dbgs) {
|
|
IRB.SetInsertPoint(Dbg);
|
|
auto &DL = I->getDataLayout();
|
|
assert(isSplitFatPtr(I->getType()) &&
|
|
"We should've RAUW'd away loads, stores, etc. at this point");
|
|
auto *OffDbg = cast<DbgValueInst>(Dbg->clone());
|
|
copyMetadata(OffDbg, Dbg);
|
|
auto [Rsrc, Off] = getPtrParts(I);
|
|
|
|
int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType());
|
|
int64_t OffSz = DL.getTypeSizeInBits(Off->getType());
|
|
|
|
std::optional<DIExpression *> RsrcExpr =
|
|
DIExpression::createFragmentExpression(Dbg->getExpression(), 0,
|
|
RsrcSz);
|
|
std::optional<DIExpression *> OffExpr =
|
|
DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz,
|
|
OffSz);
|
|
if (OffExpr) {
|
|
OffDbg->setExpression(*OffExpr);
|
|
OffDbg->replaceVariableLocationOp(I, Off);
|
|
IRB.Insert(OffDbg);
|
|
} else {
|
|
OffDbg->deleteValue();
|
|
}
|
|
if (RsrcExpr) {
|
|
Dbg->setExpression(*RsrcExpr);
|
|
Dbg->replaceVariableLocationOp(I, Rsrc);
|
|
} else {
|
|
Dbg->replaceVariableLocationOp(I, PoisonValue::get(I->getType()));
|
|
}
|
|
}
|
|
|
|
Value *Poison = PoisonValue::get(I->getType());
|
|
I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool {
|
|
if (const auto *UI = dyn_cast<Instruction>(U.getUser()))
|
|
return SplitUsers.contains(UI);
|
|
return false;
|
|
});
|
|
|
|
if (I->use_empty()) {
|
|
I->eraseFromParent();
|
|
continue;
|
|
}
|
|
IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
|
|
IRB.SetCurrentDebugLocation(I->getDebugLoc());
|
|
auto [Rsrc, Off] = getPtrParts(I);
|
|
Value *Struct = PoisonValue::get(I->getType());
|
|
Struct = IRB.CreateInsertValue(Struct, Rsrc, 0);
|
|
Struct = IRB.CreateInsertValue(Struct, Off, 1);
|
|
copyMetadata(Struct, I);
|
|
Struct->takeName(I);
|
|
I->replaceAllUsesWith(Struct);
|
|
I->eraseFromParent();
|
|
}
|
|
}
|
|
|
|
void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) {
|
|
LLVMContext &Ctx = Intr->getContext();
|
|
Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A));
|
|
}
|
|
|
|
void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order,
|
|
SyncScope::ID SSID) {
|
|
switch (Order) {
|
|
case AtomicOrdering::Release:
|
|
case AtomicOrdering::AcquireRelease:
|
|
case AtomicOrdering::SequentiallyConsistent:
|
|
IRB.CreateFence(AtomicOrdering::Release, SSID);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order,
|
|
SyncScope::ID SSID) {
|
|
switch (Order) {
|
|
case AtomicOrdering::Acquire:
|
|
case AtomicOrdering::AcquireRelease:
|
|
case AtomicOrdering::SequentiallyConsistent:
|
|
IRB.CreateFence(AtomicOrdering::Acquire, SSID);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
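// For example, a seq_cst atomic access is bracketed by a `fence release` (in
// the access's sync scope) emitted before the buffer intrinsic and a
// `fence acquire` emitted after it; weaker orderings get only the fence(s)
// they require, and non-atomic accesses get none.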
|
|
|
|
Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
|
|
Type *Ty, Align Alignment,
|
|
AtomicOrdering Order, bool IsVolatile,
|
|
SyncScope::ID SSID) {
|
|
IRB.SetInsertPoint(I);
|
|
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
SmallVector<Value *, 5> Args;
|
|
if (Arg)
|
|
Args.push_back(Arg);
|
|
Args.push_back(Rsrc);
|
|
Args.push_back(Off);
|
|
insertPreMemOpFence(Order, SSID);
|
|
// soffset is always 0 for these cases, where we always want any offset to be
|
|
// part of bounds checking and we don't know which parts of the GEP are
// uniform.
|
|
Args.push_back(IRB.getInt32(0));
|
|
|
|
uint32_t Aux = 0;
|
|
if (IsVolatile)
|
|
Aux |= AMDGPU::CPol::VOLATILE;
|
|
Args.push_back(IRB.getInt32(Aux));
|
|
|
|
Intrinsic::ID IID = Intrinsic::not_intrinsic;
|
|
if (isa<LoadInst>(I))
|
|
IID = Order == AtomicOrdering::NotAtomic
|
|
? Intrinsic::amdgcn_raw_ptr_buffer_load
|
|
: Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
|
|
else if (isa<StoreInst>(I))
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
|
|
else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
|
|
switch (RMW->getOperation()) {
|
|
case AtomicRMWInst::Xchg:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap;
|
|
break;
|
|
case AtomicRMWInst::Add:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add;
|
|
break;
|
|
case AtomicRMWInst::Sub:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub;
|
|
break;
|
|
case AtomicRMWInst::And:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and;
|
|
break;
|
|
case AtomicRMWInst::Or:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or;
|
|
break;
|
|
case AtomicRMWInst::Xor:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor;
|
|
break;
|
|
case AtomicRMWInst::Max:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax;
|
|
break;
|
|
case AtomicRMWInst::Min:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin;
|
|
break;
|
|
case AtomicRMWInst::UMax:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax;
|
|
break;
|
|
case AtomicRMWInst::UMin:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin;
|
|
break;
|
|
case AtomicRMWInst::FAdd:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd;
|
|
break;
|
|
case AtomicRMWInst::FMax:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax;
|
|
break;
|
|
case AtomicRMWInst::FMin:
|
|
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
|
|
break;
|
|
case AtomicRMWInst::FSub: {
|
|
report_fatal_error("atomic floating point subtraction not supported for "
|
|
"buffer resources and should've been expanded away");
|
|
break;
|
|
}
|
|
case AtomicRMWInst::FMaximum: {
|
|
report_fatal_error("atomic floating point fmaximum not supported for "
|
|
"buffer resources and should've been expanded away");
|
|
break;
|
|
}
|
|
case AtomicRMWInst::FMinimum: {
|
|
report_fatal_error("atomic floating point fminimum not supported for "
|
|
"buffer resources and should've been expanded away");
|
|
break;
|
|
}
|
|
case AtomicRMWInst::Nand:
|
|
report_fatal_error("atomic nand not supported for buffer resources and "
|
|
"should've been expanded away");
|
|
break;
|
|
case AtomicRMWInst::UIncWrap:
|
|
case AtomicRMWInst::UDecWrap:
|
|
report_fatal_error("wrapping increment/decrement not supported for "
"buffer resources and should've been expanded away");
|
|
break;
|
|
case AtomicRMWInst::BAD_BINOP:
|
|
llvm_unreachable("Not sure how we got a bad binop");
|
|
case AtomicRMWInst::USubCond:
|
|
case AtomicRMWInst::USubSat:
|
|
break;
|
|
}
|
|
}
|
|
|
|
auto *Call = IRB.CreateIntrinsic(IID, Ty, Args);
|
|
copyMetadata(Call, I);
|
|
setAlign(Call, Alignment, Arg ? 1 : 0);
|
|
Call->takeName(I);
|
|
|
|
insertPostMemOpFence(Order, SSID);
|
|
// The "no moving p7 directly" rewrites ensure that this load or store won't
|
|
// itself need to be split into parts.
|
|
SplitUsers.insert(I);
|
|
I->replaceAllUsesWith(Call);
|
|
return Call;
|
|
}
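// Rough sketch of the lowering above (hypothetical value names):
//   %v = load i32, ptr addrspace(7) %p, align 4
// becomes
//   %v = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(
//            ptr addrspace(8) align 4 %p.rsrc, i32 %p.off, i32 0, i32 0)
// with the volatile bit, if any, folded into the final aux operand.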
|
|
|
|
PtrParts SplitPtrStructs::visitInstruction(Instruction &I) {
|
|
return {nullptr, nullptr};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) {
|
|
if (!isSplitFatPtr(LI.getPointerOperandType()))
|
|
return {nullptr, nullptr};
|
|
handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(),
|
|
LI.getAlign(), LI.getOrdering(), LI.isVolatile(),
|
|
LI.getSyncScopeID());
|
|
return {nullptr, nullptr};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) {
|
|
if (!isSplitFatPtr(SI.getPointerOperandType()))
|
|
return {nullptr, nullptr};
|
|
Value *Arg = SI.getValueOperand();
|
|
handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(),
|
|
SI.getAlign(), SI.getOrdering(), SI.isVolatile(),
|
|
SI.getSyncScopeID());
|
|
return {nullptr, nullptr};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) {
|
|
if (!isSplitFatPtr(AI.getPointerOperand()->getType()))
|
|
return {nullptr, nullptr};
|
|
Value *Arg = AI.getValOperand();
|
|
handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(),
|
|
AI.getAlign(), AI.getOrdering(), AI.isVolatile(),
|
|
AI.getSyncScopeID());
|
|
return {nullptr, nullptr};
|
|
}
|
|
|
|
// Unlike load, store, and RMW, cmpxchg needs special handling to account
|
|
// for the boolean argument.
|
|
PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
|
|
Value *Ptr = AI.getPointerOperand();
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&AI);
|
|
|
|
Type *Ty = AI.getNewValOperand()->getType();
|
|
AtomicOrdering Order = AI.getMergedOrdering();
|
|
SyncScope::ID SSID = AI.getSyncScopeID();
|
|
bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal);
|
|
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
insertPreMemOpFence(Order, SSID);
|
|
|
|
uint32_t Aux = 0;
|
|
if (IsNonTemporal)
|
|
Aux |= AMDGPU::CPol::SLC;
|
|
if (AI.isVolatile())
|
|
Aux |= AMDGPU::CPol::VOLATILE;
|
|
auto *Call =
|
|
IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty,
|
|
{AI.getNewValOperand(), AI.getCompareOperand(), Rsrc,
|
|
Off, IRB.getInt32(0), IRB.getInt32(Aux)});
|
|
copyMetadata(Call, &AI);
|
|
setAlign(Call, AI.getAlign(), 2);
|
|
Call->takeName(&AI);
|
|
insertPostMemOpFence(Order, SSID);
|
|
|
|
Value *Res = PoisonValue::get(AI.getType());
|
|
Res = IRB.CreateInsertValue(Res, Call, 0);
|
|
if (!AI.isWeak()) {
|
|
Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand());
|
|
Res = IRB.CreateInsertValue(Res, Succeeded, 1);
|
|
}
|
|
SplitUsers.insert(&AI);
|
|
AI.replaceAllUsesWith(Res);
|
|
return {nullptr, nullptr};
|
|
}
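// Sketch (hypothetical value names): a strong i32 cmpxchg on %p becomes
//   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(
//              i32 %new, i32 %cmp, ptr addrspace(8) %p.rsrc, i32 %p.off,
//              i32 0, i32 %aux)
//   %ok = icmp eq i32 %old, %cmp
// and the {i32, i1} result is rebuilt from %old and %ok; the success flag is
// only filled in for strong exchanges.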
|
|
|
|
PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
|
|
using namespace llvm::PatternMatch;
|
|
Value *Ptr = GEP.getPointerOperand();
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&GEP);
|
|
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
const DataLayout &DL = GEP.getDataLayout();
|
|
bool IsNUW = GEP.hasNoUnsignedWrap();
|
|
bool IsNUSW = GEP.hasNoUnsignedSignedWrap();
|
|
|
|
StructType *ResTy = cast<StructType>(GEP.getType());
|
|
Type *ResRsrcTy = ResTy->getElementType(0);
|
|
VectorType *ResRsrcVecTy = dyn_cast<VectorType>(ResRsrcTy);
|
|
bool BroadcastsPtr = ResRsrcVecTy && !isa<VectorType>(Off->getType());
|
|
|
|
// In order to call emitGEPOffset() and thus not have to reimplement it,
|
|
// we need the GEP result to have ptr addrspace(7) type.
|
|
Type *FatPtrTy =
|
|
ResRsrcTy->getWithNewType(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER));
|
|
GEP.mutateType(FatPtrTy);
|
|
Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
|
|
GEP.mutateType(ResTy);
|
|
|
|
if (BroadcastsPtr) {
|
|
Rsrc = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Rsrc,
|
|
Rsrc->getName());
|
|
Off = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Off,
|
|
Off->getName());
|
|
}
|
|
if (match(OffAccum, m_Zero())) { // Constant-zero offset
|
|
SplitUsers.insert(&GEP);
|
|
return {Rsrc, Off};
|
|
}
|
|
|
|
bool HasNonNegativeOff = false;
|
|
if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
|
|
HasNonNegativeOff = !CI->isNegative();
|
|
}
|
|
Value *NewOff;
|
|
if (match(Off, m_Zero())) {
|
|
NewOff = OffAccum;
|
|
} else {
|
|
NewOff = IRB.CreateAdd(Off, OffAccum, "",
|
|
/*hasNUW=*/IsNUW || (IsNUSW && HasNonNegativeOff),
|
|
/*hasNSW=*/false);
|
|
}
|
|
copyMetadata(NewOff, &GEP);
|
|
NewOff->takeName(&GEP);
|
|
SplitUsers.insert(&GEP);
|
|
return {Rsrc, NewOff};
|
|
}
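// Sketch (pre-lowering notation, hypothetical value names):
//   %q = getelementptr i32, ptr addrspace(7) %p, i32 %i
// keeps %p.rsrc as-is and computes, via emitGEPOffset(),
//   %q.idx = mul i32 %i, 4
//   %q.off = add i32 %p.off, %q.idx
// with nuw propagated onto the add when the GEP's no-wrap flags allow it.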
|
|
|
|
PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
|
|
Value *Ptr = PI.getPointerOperand();
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&PI);
|
|
|
|
Type *ResTy = PI.getType();
|
|
unsigned Width = ResTy->getScalarSizeInBits();
|
|
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
const DataLayout &DL = PI.getDataLayout();
|
|
unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);
|
|
|
|
Value *Res;
|
|
if (Width <= BufferOffsetWidth) {
|
|
Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
|
|
PI.getName() + ".off");
|
|
} else {
|
|
Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
|
|
Value *Shl = IRB.CreateShl(
|
|
RsrcInt,
|
|
ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)),
|
|
"", Width >= FatPtrWidth, Width > FatPtrWidth);
|
|
Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
|
|
PI.getName() + ".off");
|
|
Res = IRB.CreateOr(Shl, OffCast);
|
|
}
|
|
|
|
copyMetadata(Res, &PI);
|
|
Res->takeName(&PI);
|
|
SplitUsers.insert(&PI);
|
|
PI.replaceAllUsesWith(Res);
|
|
return {nullptr, nullptr};
|
|
}
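// Sketch (hypothetical value names): `ptrtoint ptr addrspace(7) %p to i160`
// becomes, in effect,
//   %r = ptrtoint ptr addrspace(8) %p.rsrc to i160
//   %hi = shl i160 %r, 32
//   %lo = zext i32 %p.off to i160
//   %res = or i160 %hi, %lo
// while narrow results (32 bits or fewer) are taken from the offset alone.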
|
|
|
|
PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
|
|
if (!isSplitFatPtr(IP.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&IP);
|
|
const DataLayout &DL = IP.getDataLayout();
|
|
unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
|
|
Value *Int = IP.getOperand(0);
|
|
Type *IntTy = Int->getType();
|
|
Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth);
|
|
unsigned Width = IntTy->getScalarSizeInBits();
|
|
|
|
auto *RetTy = cast<StructType>(IP.getType());
|
|
Type *RsrcTy = RetTy->getElementType(0);
|
|
Type *OffTy = RetTy->getElementType(1);
|
|
Value *RsrcPart = IRB.CreateLShr(
|
|
Int,
|
|
ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth)));
|
|
Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false);
|
|
Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc");
|
|
Value *Off =
|
|
IRB.CreateIntCast(Int, OffTy, /*IsSigned=*/false, IP.getName() + ".off");
|
|
|
|
copyMetadata(Rsrc, &IP);
|
|
SplitUsers.insert(&IP);
|
|
return {Rsrc, Off};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
|
|
// TODO(krzysz00): handle casts from ptr addrspace(7) to global pointers
|
|
// by computing the effective address.
|
|
if (!isSplitFatPtr(I.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
Value *In = I.getPointerOperand();
|
|
// No-op casts preserve parts
|
|
if (In->getType() == I.getType()) {
|
|
auto [Rsrc, Off] = getPtrParts(In);
|
|
SplitUsers.insert(&I);
|
|
return {Rsrc, Off};
|
|
}
|
|
|
|
auto *ResTy = cast<StructType>(I.getType());
|
|
Type *RsrcTy = ResTy->getElementType(0);
|
|
Type *OffTy = ResTy->getElementType(1);
|
|
Value *ZeroOff = Constant::getNullValue(OffTy);
|
|
|
|
// Special case for null pointers, undef, and poison, which can be created by
|
|
// address space propagation.
|
|
auto *InConst = dyn_cast<Constant>(In);
|
|
if (InConst && InConst->isNullValue()) {
|
|
Value *NullRsrc = Constant::getNullValue(RsrcTy);
|
|
SplitUsers.insert(&I);
|
|
return {NullRsrc, ZeroOff};
|
|
}
|
|
if (isa<PoisonValue>(In)) {
|
|
Value *PoisonRsrc = PoisonValue::get(RsrcTy);
|
|
Value *PoisonOff = PoisonValue::get(OffTy);
|
|
SplitUsers.insert(&I);
|
|
return {PoisonRsrc, PoisonOff};
|
|
}
|
|
if (isa<UndefValue>(In)) {
|
|
Value *UndefRsrc = UndefValue::get(RsrcTy);
|
|
Value *UndefOff = UndefValue::get(OffTy);
|
|
SplitUsers.insert(&I);
|
|
return {UndefRsrc, UndefOff};
|
|
}
|
|
|
|
if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
|
|
report_fatal_error(
|
|
"only buffer resources (addrspace 8) and null/poison pointers can be "
|
|
"cast to buffer fat pointers (addrspace 7)");
|
|
SplitUsers.insert(&I);
|
|
return {In, ZeroOff};
|
|
}
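// Sketch (hypothetical value names): `addrspacecast ptr addrspace(8) %r to
// ptr addrspace(7)` lowers to the pair {%r, i32 0}; null, undef, and poison
// inputs map to the corresponding per-part constants, and any other source
// address space is a fatal error.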
|
|
|
|
PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
|
|
Value *Lhs = Cmp.getOperand(0);
|
|
if (!isSplitFatPtr(Lhs->getType()))
|
|
return {nullptr, nullptr};
|
|
Value *Rhs = Cmp.getOperand(1);
|
|
IRB.SetInsertPoint(&Cmp);
|
|
ICmpInst::Predicate Pred = Cmp.getPredicate();
|
|
|
|
assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
|
|
"Pointer comparison is only equal or unequal");
|
|
auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
|
|
auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
|
|
Value *RsrcCmp =
|
|
IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
|
|
copyMetadata(RsrcCmp, &Cmp);
|
|
Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
|
|
copyMetadata(OffCmp, &Cmp);
|
|
|
|
Value *Res = nullptr;
|
|
if (Pred == ICmpInst::ICMP_EQ)
|
|
Res = IRB.CreateAnd(RsrcCmp, OffCmp);
|
|
else if (Pred == ICmpInst::ICMP_NE)
|
|
Res = IRB.CreateOr(RsrcCmp, OffCmp);
|
|
copyMetadata(Res, &Cmp);
|
|
Res->takeName(&Cmp);
|
|
SplitUsers.insert(&Cmp);
|
|
Cmp.replaceAllUsesWith(Res);
|
|
return {nullptr, nullptr};
|
|
}
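// Sketch (hypothetical value names): `icmp eq ptr addrspace(7) %a, %b` becomes
//   %eq.rsrc = icmp eq ptr addrspace(8) %a.rsrc, %b.rsrc
//   %eq.off = icmp eq i32 %a.off, %b.off
//   %eq = and i1 %eq.rsrc, %eq.off
// and `icmp ne` uses `or` of the per-part `ne` comparisons instead.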
|
|
|
|
PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) {
|
|
if (!isSplitFatPtr(I.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
auto [Rsrc, Off] = getPtrParts(I.getOperand(0));
|
|
|
|
Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc");
|
|
copyMetadata(RsrcRes, &I);
|
|
Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off");
|
|
copyMetadata(OffRes, &I);
|
|
SplitUsers.insert(&I);
|
|
return {RsrcRes, OffRes};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) {
|
|
if (!isSplitFatPtr(I.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
Value *Vec = I.getVectorOperand();
|
|
Value *Idx = I.getIndexOperand();
|
|
auto [Rsrc, Off] = getPtrParts(Vec);
|
|
|
|
Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc");
|
|
copyMetadata(RsrcRes, &I);
|
|
Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off");
|
|
copyMetadata(OffRes, &I);
|
|
SplitUsers.insert(&I);
|
|
return {RsrcRes, OffRes};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
|
|
// The mutated instructions temporarily don't return vectors, and so
|
|
// we need the generic getType() here to avoid crashes.
|
|
if (!isSplitFatPtr(cast<Instruction>(I).getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
Value *Vec = I.getOperand(0);
|
|
Value *Elem = I.getOperand(1);
|
|
Value *Idx = I.getOperand(2);
|
|
auto [VecRsrc, VecOff] = getPtrParts(Vec);
|
|
auto [ElemRsrc, ElemOff] = getPtrParts(Elem);
|
|
|
|
Value *RsrcRes =
|
|
IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
|
|
copyMetadata(RsrcRes, &I);
|
|
Value *OffRes =
|
|
IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
|
|
copyMetadata(OffRes, &I);
|
|
SplitUsers.insert(&I);
|
|
return {RsrcRes, OffRes};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
|
|
// Cast is needed for the same reason as insertelement's.
|
|
if (!isSplitFatPtr(cast<Instruction>(I).getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
|
|
Value *V1 = I.getOperand(0);
|
|
Value *V2 = I.getOperand(1);
|
|
ArrayRef<int> Mask = I.getShuffleMask();
|
|
auto [V1Rsrc, V1Off] = getPtrParts(V1);
|
|
auto [V2Rsrc, V2Off] = getPtrParts(V2);
|
|
|
|
Value *RsrcRes =
|
|
IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
|
|
copyMetadata(RsrcRes, &I);
|
|
Value *OffRes =
|
|
IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
|
|
copyMetadata(OffRes, &I);
|
|
SplitUsers.insert(&I);
|
|
return {RsrcRes, OffRes};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
|
|
if (!isSplitFatPtr(PHI.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
|
|
// Phi nodes will be handled in post-processing after we've visited every
|
|
// instruction. However, instead of just returning {nullptr, nullptr},
|
|
// we explicitly create the temporary extractvalue operations that are our
|
|
// temporary results so that they end up at the beginning of the block with
|
|
// the PHIs.
|
|
Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
|
|
Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
|
|
Conditionals.push_back(&PHI);
|
|
SplitUsers.insert(&PHI);
|
|
return {TmpRsrc, TmpOff};
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
|
|
if (!isSplitFatPtr(SI.getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&SI);
|
|
|
|
Value *Cond = SI.getCondition();
|
|
Value *True = SI.getTrueValue();
|
|
Value *False = SI.getFalseValue();
|
|
auto [TrueRsrc, TrueOff] = getPtrParts(True);
|
|
auto [FalseRsrc, FalseOff] = getPtrParts(False);
|
|
|
|
Value *RsrcRes =
|
|
IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
|
|
copyMetadata(RsrcRes, &SI);
|
|
Conditionals.push_back(&SI);
|
|
Value *OffRes =
|
|
IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI);
|
|
copyMetadata(OffRes, &SI);
|
|
SplitUsers.insert(&SI);
|
|
return {RsrcRes, OffRes};
|
|
}
|
|
|
|
/// Returns true if this intrinsic needs to be removed when it is
|
|
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
|
|
/// rewritten into calls to versions of that intrinsic on the resource
|
|
/// descriptor.
|
|
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
|
|
switch (IID) {
|
|
default:
|
|
return false;
|
|
case Intrinsic::amdgcn_make_buffer_rsrc:
|
|
case Intrinsic::ptrmask:
|
|
case Intrinsic::invariant_start:
|
|
case Intrinsic::invariant_end:
|
|
case Intrinsic::launder_invariant_group:
|
|
case Intrinsic::strip_invariant_group:
|
|
case Intrinsic::memcpy:
|
|
case Intrinsic::memcpy_inline:
|
|
case Intrinsic::memmove:
|
|
case Intrinsic::memset:
|
|
case Intrinsic::memset_inline:
|
|
case Intrinsic::experimental_memset_pattern:
|
|
case Intrinsic::amdgcn_load_to_lds:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
|
|
Intrinsic::ID IID = I.getIntrinsicID();
|
|
switch (IID) {
|
|
default:
|
|
break;
|
|
case Intrinsic::amdgcn_make_buffer_rsrc: {
|
|
if (!isSplitFatPtr(I.getType()))
|
|
return {nullptr, nullptr};
|
|
Value *Base = I.getArgOperand(0);
|
|
Value *Stride = I.getArgOperand(1);
|
|
Value *NumRecords = I.getArgOperand(2);
|
|
Value *Flags = I.getArgOperand(3);
|
|
auto *SplitType = cast<StructType>(I.getType());
|
|
Type *RsrcType = SplitType->getElementType(0);
|
|
Type *OffType = SplitType->getElementType(1);
|
|
IRB.SetInsertPoint(&I);
|
|
Value *Rsrc = IRB.CreateIntrinsic(IID, {RsrcType, Base->getType()},
|
|
{Base, Stride, NumRecords, Flags});
|
|
copyMetadata(Rsrc, &I);
|
|
Rsrc->takeName(&I);
|
|
Value *Zero = Constant::getNullValue(OffType);
|
|
SplitUsers.insert(&I);
|
|
return {Rsrc, Zero};
|
|
}
|
|
case Intrinsic::ptrmask: {
|
|
Value *Ptr = I.getArgOperand(0);
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
Value *Mask = I.getArgOperand(1);
|
|
IRB.SetInsertPoint(&I);
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
if (Mask->getType() != Off->getType())
|
|
report_fatal_error("offset width is not equal to index width of fat "
|
|
"pointer (data layout not set up correctly?)");
|
|
Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
|
|
copyMetadata(OffRes, &I);
|
|
SplitUsers.insert(&I);
|
|
return {Rsrc, OffRes};
|
|
}
|
|
// Pointer annotation intrinsics that, given their object-wide nature,
// operate on the resource part.
|
|
case Intrinsic::invariant_start: {
|
|
Value *Ptr = I.getArgOperand(1);
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
|
|
auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
|
|
copyMetadata(NewRsrc, &I);
|
|
NewRsrc->takeName(&I);
|
|
SplitUsers.insert(&I);
|
|
I.replaceAllUsesWith(NewRsrc);
|
|
return {nullptr, nullptr};
|
|
}
|
|
case Intrinsic::invariant_end: {
|
|
Value *RealPtr = I.getArgOperand(2);
|
|
if (!isSplitFatPtr(RealPtr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
Value *RealRsrc = getPtrParts(RealPtr).first;
|
|
Value *InvPtr = I.getArgOperand(0);
|
|
Value *Size = I.getArgOperand(1);
|
|
Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
|
|
{InvPtr, Size, RealRsrc});
|
|
copyMetadata(NewRsrc, &I);
|
|
NewRsrc->takeName(&I);
|
|
SplitUsers.insert(&I);
|
|
I.replaceAllUsesWith(NewRsrc);
|
|
return {nullptr, nullptr};
|
|
}
|
|
case Intrinsic::launder_invariant_group:
|
|
case Intrinsic::strip_invariant_group: {
|
|
Value *Ptr = I.getArgOperand(0);
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
|
|
copyMetadata(NewRsrc, &I);
|
|
NewRsrc->takeName(&I);
|
|
SplitUsers.insert(&I);
|
|
return {NewRsrc, Off};
|
|
}
|
|
case Intrinsic::amdgcn_load_to_lds: {
|
|
Value *Ptr = I.getArgOperand(0);
|
|
if (!isSplitFatPtr(Ptr->getType()))
|
|
return {nullptr, nullptr};
|
|
IRB.SetInsertPoint(&I);
|
|
auto [Rsrc, Off] = getPtrParts(Ptr);
|
|
Value *LDSPtr = I.getArgOperand(1);
|
|
Value *LoadSize = I.getArgOperand(2);
|
|
Value *ImmOff = I.getArgOperand(3);
|
|
Value *Aux = I.getArgOperand(4);
|
|
Value *SOffset = IRB.getInt32(0);
|
|
Instruction *NewLoad = IRB.CreateIntrinsic(
|
|
Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
|
|
{Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
|
|
copyMetadata(NewLoad, &I);
|
|
SplitUsers.insert(&I);
|
|
I.replaceAllUsesWith(NewLoad);
|
|
return {nullptr, nullptr};
|
|
}
|
|
}
|
|
return {nullptr, nullptr};
|
|
}
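// Sketch of one case above (hypothetical value names): a call to
// @llvm.amdgcn.load.to.lds on a fat pointer %p becomes
//   call void @llvm.amdgcn.raw.ptr.buffer.load.lds(
//       ptr addrspace(8) %p.rsrc, ptr addrspace(3) %lds, i32 %size,
//       i32 %p.off, i32 0, i32 %immoff, i32 %aux)
// i.e. the offset part feeds the voffset operand and soffset is fixed to 0,
// mirroring the other memory-op lowerings in this file.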
|
|
|
|
void SplitPtrStructs::processFunction(Function &F) {
|
|
ST = &TM->getSubtarget<GCNSubtarget>(F);
|
|
SmallVector<Instruction *, 0> Originals(
|
|
llvm::make_pointer_range(instructions(F)));
|
|
LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
|
|
<< "\n");
|
|
for (Instruction *I : Originals) {
|
|
auto [Rsrc, Off] = visit(I);
|
|
assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
|
|
"Can't have a resource but no offset");
|
|
if (Rsrc)
|
|
RsrcParts[I] = Rsrc;
|
|
if (Off)
|
|
OffParts[I] = Off;
|
|
}
|
|
processConditionals();
|
|
killAndReplaceSplitInstructions(Originals);
|
|
|
|
// Clean up after ourselves to save on memory.
|
|
RsrcParts.clear();
|
|
OffParts.clear();
|
|
SplitUsers.clear();
|
|
Conditionals.clear();
|
|
ConditionalTemps.clear();
|
|
}
|
|
|
|
namespace {
|
|
class AMDGPULowerBufferFatPointers : public ModulePass {
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPULowerBufferFatPointers() : ModulePass(ID) {}
|
|
|
|
bool run(Module &M, const TargetMachine &TM);
|
|
bool runOnModule(Module &M) override;
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override;
|
|
};
|
|
} // namespace
|
|
|
|
/// Returns true if there are values that have a buffer fat pointer in them,
|
|
/// which means we'll need to perform rewrites on this function. As a side
|
|
/// effect, this will populate the type remapping cache.
|
|
static bool containsBufferFatPointers(const Function &F,
|
|
BufferFatPtrToStructTypeMap *TypeMap) {
|
|
bool HasFatPointers = false;
|
|
for (const BasicBlock &BB : F)
|
|
for (const Instruction &I : BB)
|
|
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
|
|
return HasFatPointers;
|
|
}
|
|
|
|
static bool hasFatPointerInterface(const Function &F,
|
|
BufferFatPtrToStructTypeMap *TypeMap) {
|
|
Type *Ty = F.getFunctionType();
|
|
return Ty != TypeMap->remapType(Ty);
|
|
}
|
|
|
|
/// Move the body of `OldF` into a new function, returning it.
|
|
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
|
|
ValueToValueMapTy &CloneMap) {
|
|
bool IsIntrinsic = OldF->isIntrinsic();
|
|
Function *NewF =
|
|
Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
|
|
NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
|
|
NewF->copyAttributesFrom(OldF);
|
|
NewF->copyMetadata(OldF, 0);
|
|
NewF->takeName(OldF);
|
|
NewF->updateAfterNameChange();
|
|
NewF->setDLLStorageClass(OldF->getDLLStorageClass());
|
|
OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);
|
|
|
|
while (!OldF->empty()) {
|
|
BasicBlock *BB = &OldF->front();
|
|
BB->removeFromParent();
|
|
BB->insertInto(NewF);
|
|
CloneMap[BB] = BB;
|
|
for (Instruction &I : *BB) {
|
|
CloneMap[&I] = &I;
|
|
}
|
|
}
|
|
|
|
SmallVector<AttributeSet> ArgAttrs;
|
|
AttributeList OldAttrs = OldF->getAttributes();
|
|
|
|
for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
|
|
CloneMap[&NewArg] = &OldArg;
|
|
NewArg.takeName(&OldArg);
|
|
Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
|
|
// Temporarily mutate type of `NewArg` to allow RAUW to work.
|
|
NewArg.mutateType(OldArgTy);
|
|
OldArg.replaceAllUsesWith(&NewArg);
|
|
NewArg.mutateType(NewArgTy);
|
|
|
|
AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
|
|
// Intrinsics get their attributes fixed later.
|
|
if (OldArgTy != NewArgTy && !IsIntrinsic)
|
|
ArgAttr = ArgAttr.removeAttributes(
|
|
NewF->getContext(),
|
|
AttributeFuncs::typeIncompatible(NewArgTy, ArgAttr));
|
|
ArgAttrs.push_back(ArgAttr);
|
|
}
|
|
AttributeSet RetAttrs = OldAttrs.getRetAttrs();
|
|
if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
|
|
RetAttrs = RetAttrs.removeAttributes(
|
|
NewF->getContext(),
|
|
AttributeFuncs::typeIncompatible(NewF->getReturnType(), RetAttrs));
|
|
NewF->setAttributes(AttributeList::get(
|
|
NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
|
|
return NewF;
|
|
}
|
|
|
|
static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
|
|
for (Argument &A : F->args())
|
|
CloneMap[&A] = &A;
|
|
for (BasicBlock &BB : *F) {
|
|
CloneMap[&BB] = &BB;
|
|
for (Instruction &I : BB)
|
|
CloneMap[&I] = &I;
|
|
}
|
|
}
|
|
|
|
bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
|
|
bool Changed = false;
|
|
const DataLayout &DL = M.getDataLayout();
|
|
// Record the functions which need to be remapped.
|
|
// The second element of the pair indicates whether the function has to have
|
|
// its arguments or return types adjusted.
|
|
SmallVector<std::pair<Function *, bool>> NeedsRemap;
|
|
|
|
BufferFatPtrToStructTypeMap StructTM(DL);
|
|
BufferFatPtrToIntTypeMap IntTM(DL);
|
|
for (const GlobalVariable &GV : M.globals()) {
|
|
if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
|
|
report_fatal_error("Global variables with a buffer fat pointer address "
|
|
"space (7) are not supported");
|
|
Type *VT = GV.getValueType();
|
|
if (VT != StructTM.remapType(VT))
|
|
report_fatal_error("Global variables that contain buffer fat pointers "
|
|
"(address space 7 pointers) are unsupported. Use "
|
|
"buffer resource pointers (address space 8) instead.");
|
|
}
|
|
|
|
{
|
|
// Collect all constant exprs and aggregates referenced by any function.
|
|
SmallVector<Constant *, 8> Worklist;
|
|
for (Function &F : M.functions())
|
|
for (Instruction &I : instructions(F))
|
|
for (Value *Op : I.operands())
|
|
if (isa<ConstantExpr, ConstantAggregate>(Op))
|
|
Worklist.push_back(cast<Constant>(Op));
|
|
|
|
// Recursively look for any referenced buffer pointer constants.
|
|
SmallPtrSet<Constant *, 8> Visited;
|
|
SetVector<Constant *> BufferFatPtrConsts;
|
|
while (!Worklist.empty()) {
|
|
Constant *C = Worklist.pop_back_val();
|
|
if (!Visited.insert(C).second)
|
|
continue;
|
|
if (isBufferFatPtrOrVector(C->getType()))
|
|
BufferFatPtrConsts.insert(C);
|
|
for (Value *Op : C->operands())
|
|
if (isa<ConstantExpr, ConstantAggregate>(Op))
|
|
Worklist.push_back(cast<Constant>(Op));
|
|
}
|
|
|
|
// Expand all constant expressions using fat buffer pointers to
|
|
// instructions.
|
|
Changed |= convertUsersOfConstantsToInstructions(
|
|
BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
|
|
/*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
|
|
}
|
|
|
|
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
|
|
M.getContext(), &TM);
|
|
LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
|
|
M.getContext());
|
|
for (Function &F : M.functions()) {
|
|
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
|
|
bool BodyChanges = containsBufferFatPointers(F, &StructTM);
|
|
Changed |= MemOpsRewrite.processFunction(F);
|
|
if (InterfaceChange || BodyChanges) {
|
|
NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
|
|
Changed |= BufferContentsTypeRewrite.processFunction(F);
|
|
}
|
|
}
|
|
if (NeedsRemap.empty())
|
|
return Changed;
|
|
|
|
SmallVector<Function *> NeedsPostProcess;
|
|
SmallVector<Function *> Intrinsics;
|
|
// Keep one big map so as to memoize constants across functions.
|
|
ValueToValueMapTy CloneMap;
|
|
FatPtrConstMaterializer Materializer(&StructTM, CloneMap);
|
|
|
|
ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
|
|
for (auto [F, InterfaceChange] : NeedsRemap) {
|
|
Function *NewF = F;
|
|
if (InterfaceChange)
|
|
NewF = moveFunctionAdaptingType(
|
|
F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
|
|
CloneMap);
|
|
else
|
|
makeCloneInPlaceMap(F, CloneMap);
|
|
LowerInFuncs.remapFunction(*NewF);
|
|
if (NewF->isIntrinsic())
|
|
Intrinsics.push_back(NewF);
|
|
else
|
|
NeedsPostProcess.push_back(NewF);
|
|
if (InterfaceChange) {
|
|
F->replaceAllUsesWith(NewF);
|
|
F->eraseFromParent();
|
|
}
|
|
Changed = true;
|
|
}
|
|
StructTM.clear();
|
|
IntTM.clear();
|
|
CloneMap.clear();
|
|
|
|
SplitPtrStructs Splitter(DL, M.getContext(), &TM);
|
|
for (Function *F : NeedsPostProcess)
|
|
Splitter.processFunction(*F);
|
|
for (Function *F : Intrinsics) {
|
|
if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
|
|
F->eraseFromParent();
|
|
} else {
|
|
std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
|
|
if (NewF)
|
|
F->replaceAllUsesWith(*NewF);
|
|
}
|
|
}
|
|
return Changed;
|
|
}
|
|
|
|
bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
|
|
TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
|
|
const TargetMachine &TM = TPC.getTM<TargetMachine>();
|
|
return run(M, TM);
|
|
}
|
|
|
|
char AMDGPULowerBufferFatPointers::ID = 0;
|
|
|
|
char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;
|
|
|
|
void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
|
|
AU.addRequired<TargetPassConfig>();
|
|
}
|
|
|
|
#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
|
|
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
|
|
false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
|
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
|
|
false)
|
|
#undef PASS_DESC
|
|
|
|
ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
|
|
return new AMDGPULowerBufferFatPointers();
|
|
}
|
|
|
|
PreservedAnalyses
|
|
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
|
|
return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
|
|
: PreservedAnalyses::all();
|
|
}
|