llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Tony Tye 53eb469195 [AMDGPU] Support non-strictly stronger memory orderings in SIMemoryLegalizer
C++20 no longer requires the failure memory ordering to be no stronger than the
success memory ordering. Adjust the assert in the AMDGPU SIMemoryLegalizer, and
merge instruction memory orderings.

Add a common operation to merge memory orderings that allows orderings which are
not strictly ordered relative to each other to be combined. Use it in
SIMemoryLegalizer and MachineMemOperand::getMergedOrdering.

Reviewed By: efriedma, rampitec

Differential Revision: https://reviews.llvm.org/D106729
2021-08-10 08:43:03 +00:00
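
As an illustration of the ordering-merge operation this change introduces, here is a
minimal sketch. It assumes only the getMergedAtomicOrdering helper declared in
llvm/Support/AtomicOrdering.h (the same helper used by constructFromMIWithMMO below);
the standalone test program itself is illustrative and not part of the patch.

// merge_ordering_sketch.cpp -- illustrative only, not part of the LLVM tree.
#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

int main() {
  using llvm::AtomicOrdering;
  using llvm::getMergedAtomicOrdering;

  // Acquire and Release are not strictly ordered relative to each other;
  // their merge is the stronger AcquireRelease.
  assert(getMergedAtomicOrdering(AtomicOrdering::Acquire,
                                 AtomicOrdering::Release) ==
         AtomicOrdering::AcquireRelease);

  // When one ordering is at least as strong as the other, the stronger one
  // is returned unchanged.
  assert(getMergedAtomicOrdering(AtomicOrdering::Monotonic,
                                 AtomicOrdering::SequentiallyConsistent) ==
         AtomicOrdering::SequentiallyConsistent);
  return 0;
}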

//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
"amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
cl::desc("Use this to skip inserting cache invalidating instructions."));
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
NONE = 0u,
LOAD = 1u << 0,
STORE = 1u << 1,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
BEFORE,
AFTER
};
/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
NONE,
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
AGENT,
SYSTEM
};
/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
NONE = 0u,
GLOBAL = 1u << 0,
LDS = 1u << 1,
SCRATCH = 1u << 2,
GDS = 1u << 3,
OTHER = 1u << 4,
/// The address spaces that can be accessed by a FLAT instruction.
FLAT = GLOBAL | LDS | SCRATCH,
/// The address spaces that support atomic instructions.
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
/// All address spaces.
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
class SIMemOpInfo final {
private:
friend class SIMemOpAccess;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
SIAtomicScope Scope = SIAtomicScope::SYSTEM;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
bool IsVolatile = false;
bool IsNonTemporal = false;
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering =
AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false,
bool IsNonTemporal = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering),
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile),
IsNonTemporal(IsNonTemporal) {
if (Ordering == AtomicOrdering::NotAtomic) {
assert(Scope == SIAtomicScope::NONE &&
OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
!IsCrossAddressSpaceOrdering &&
FailureOrdering == AtomicOrdering::NotAtomic);
return;
}
assert(Scope != SIAtomicScope::NONE &&
(OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
SIAtomicAddrSpace::NONE &&
(InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
SIAtomicAddrSpace::NONE);
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
this->IsCrossAddressSpaceOrdering = false;
// Limit the scope to the maximum supported by the instruction's address
// spaces.
if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
} else if ((InstrAddrSpace &
~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
} else if ((InstrAddrSpace &
~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
}
}
public:
/// \returns Atomic synchronization scope of the machine instruction used to
/// create this SIMemOpInfo.
SIAtomicScope getScope() const {
return Scope;
}
/// \returns Ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getOrdering() const {
return Ordering;
}
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
/// \returns The address spaces accessed by the machine
/// instruction used to create this SIMemOpInfo.
SIAtomicAddrSpace getInstrAddrSpace() const {
return InstrAddrSpace;
}
/// \returns The address spaces that must be ordered by the machine
/// instruction used to create this SIMemOpInfo.
SIAtomicAddrSpace getOrderingAddrSpace() const {
return OrderingAddrSpace;
}
/// \returns True iff memory ordering of operations on
/// different address spaces is required, false otherwise.
bool getIsCrossAddressSpaceOrdering() const {
return IsCrossAddressSpaceOrdering;
}
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is volatile, false otherwise.
bool isVolatile() const {
return IsVolatile;
}
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is nontemporal, false otherwise.
bool isNonTemporal() const {
return IsNonTemporal;
}
/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
return Ordering != AtomicOrdering::NotAtomic;
}
};
class SIMemOpAccess final {
private:
AMDGPUMachineModuleInfo *MMI = nullptr;
/// Reports unsupported message \p Msg for \p MI to LLVM context.
void reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const;
/// Inspects the target synchronization scope \p SSID and determines
/// the SI atomic scope it corresponds to, the address spaces it
/// covers, and whether the memory ordering applies between address
/// spaces.
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
/// \returns A bit set of the address spaces accessed by \p AS.
SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
/// \returns Info constructed from \p MI, which has at least one machine
/// memory operand.
Optional<SIMemOpInfo> constructFromMIWithMMO(
const MachineBasicBlock::iterator &MI) const;
public:
/// Construct class to support accessing the machine memory operands
/// of instructions in the machine function \p MF.
SIMemOpAccess(MachineFunction &MF);
/// \returns Load info if \p MI is a load operation, "None" otherwise.
Optional<SIMemOpInfo> getLoadInfo(
const MachineBasicBlock::iterator &MI) const;
/// \returns Store info if \p MI is a store operation, "None" otherwise.
Optional<SIMemOpInfo> getStoreInfo(
const MachineBasicBlock::iterator &MI) const;
/// \returns Atomic fence info if \p MI is an atomic fence operation,
/// "None" otherwise.
Optional<SIMemOpInfo> getAtomicFenceInfo(
const MachineBasicBlock::iterator &MI) const;
/// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
/// rmw operation, "None" otherwise.
Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
const MachineBasicBlock::iterator &MI) const;
};
class SICacheControl {
protected:
/// AMDGPU subtarget info.
const GCNSubtarget &ST;
/// Instruction info.
const SIInstrInfo *TII = nullptr;
IsaVersion IV;
/// Whether to insert cache invalidating instructions.
bool InsertCacheInv;
SICacheControl(const GCNSubtarget &ST);
/// Sets the named bit \p Bit to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
bool enableNamedBit(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Bit) const;
public:
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
/// Update \p MI memory load instruction to bypass any caches up to
/// the \p Scope memory scope for address spaces \p
/// AddrSpace. Return true iff the instruction was modified.
virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
/// Update \p MI memory store instruction to bypass any caches up to
/// the \p Scope memory scope for address spaces \p
/// AddrSpace. Return true iff the instruction was modified.
virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
/// Update \p MI memory read-modify-write instruction to bypass any caches up
/// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
/// iff the instruction was modified.
virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
/// Update \p MI memory instruction of kind \p Op associated with address
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
/// true iff the instruction was modified.
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op, bool IsVolatile,
bool IsNonTemporal) const = 0;
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
/// address spaces. Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
/// thread with address spaces \p AddrSpace will observe the previous memory
/// operations by any thread for memory scopes up to memory scope \p Scope.
/// Returns true iff any instructions inserted.
virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure previous memory instructions by this thread
/// with address spaces \p AddrSpace have completed and can be observed by
/// subsequent memory instructions by any thread executing in memory scope \p
/// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
/// between address spaces. Returns true iff any instructions inserted.
virtual bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
class SIGfx6CacheControl : public SICacheControl {
protected:
/// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit(MI, AMDGPU::CPol::GLC);
}
/// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit(MI, AMDGPU::CPol::SLC);
}
public:
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
};
class SIGfx7CacheControl : public SIGfx6CacheControl {
public:
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
};
class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:
SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
};
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit(MI, AMDGPU::CPol::DLC);
}
public:
SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
/// Cache Control.
std::unique_ptr<SICacheControl> CC = nullptr;
/// List of atomic pseudo instructions.
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
/// Return true iff instruction \p MI is an atomic instruction that
/// returns a result.
bool isAtomicRet(const MachineInstr &MI) const {
return SIInstrInfo::isAtomicRet(MI);
}
/// Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
/// Expands load operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandLoad(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands store operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandStore(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
public:
static char ID;
SIMemoryLegalizer() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
StringRef getPassName() const override {
return PASS_NAME;
}
bool runOnMachineFunction(MachineFunction &MF) override;
};
} // end anonymous namespace
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const {
const Function &Func = MI->getParent()->getParent()->getFunction();
DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
Func.getContext().diagnose(Diag);
}
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
SIAtomicAddrSpace InstrAddrSpace) const {
if (SSID == SyncScope::System)
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC,
true);
if (SSID == MMI->getAgentSSID())
return std::make_tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC,
true);
if (SSID == MMI->getWorkgroupSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC,
true);
if (SSID == MMI->getWavefrontSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
SIAtomicAddrSpace::ATOMIC,
true);
if (SSID == SyncScope::SingleThread)
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
SIAtomicAddrSpace::ATOMIC,
true);
if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
return None;
}
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
if (AS == AMDGPUAS::FLAT_ADDRESS)
return SIAtomicAddrSpace::FLAT;
if (AS == AMDGPUAS::GLOBAL_ADDRESS)
return SIAtomicAddrSpace::GLOBAL;
if (AS == AMDGPUAS::LOCAL_ADDRESS)
return SIAtomicAddrSpace::LDS;
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
return SIAtomicAddrSpace::OTHER;
}
SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
const MachineBasicBlock::iterator &MI) const {
assert(MI->getNumMemOperands() > 0);
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
bool IsVolatile = false;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
IsNonTemporal &= MMO->isNonTemporal();
IsVolatile |= MMO->isVolatile();
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
if (OpOrdering != AtomicOrdering::NotAtomic) {
const auto &IsSyncScopeInclusion =
MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
if (!IsSyncScopeInclusion) {
reportUnsupported(MI,
"Unsupported non-inclusive atomic synchronization scope");
return None;
}
SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
FailureOrdering =
getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
}
}
SIAtomicScope Scope = SIAtomicScope::NONE;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
if (Ordering != AtomicOrdering::NotAtomic) {
auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
if (!ScopeOrNone) {
reportUnsupported(MI, "Unsupported atomic synchronization scope");
return None;
}
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
ScopeOrNone.getValue();
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
reportUnsupported(MI, "Unsupported atomic address space");
return None;
}
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
IsNonTemporal);
}
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && !MI->mayStore()))
return None;
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
return SIMemOpInfo();
return constructFromMIWithMMO(MI);
}
Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(!MI->mayLoad() && MI->mayStore()))
return None;
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
return SIMemOpInfo();
return constructFromMIWithMMO(MI);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
return None;
AtomicOrdering Ordering =
static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
if (!ScopeOrNone) {
reportUnsupported(MI, "Unsupported atomic synchronization scope");
return None;
}
SIAtomicScope Scope = SIAtomicScope::NONE;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
ScopeOrNone.getValue();
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
reportUnsupported(MI, "Unsupported atomic address space");
return None;
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
if (!(MI->mayLoad() && MI->mayStore()))
return None;
// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
return SIMemOpInfo();
return constructFromMIWithMMO(MI);
}
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Bit) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
if (!CPol)
return false;
CPol->setImm(CPol->getImm() | Bit);
return true;
}
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (ST.hasGFX90AInsts())
return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
return std::make_unique<SIGfx10CacheControl>(ST);
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to bypass.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory caches
/// to be bypassed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
return Changed;
}
bool SIGfx6CacheControl::enableStoreCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
/// The L1 cache is write-through so does not need to be bypassed. There is no
/// bypass control for the L2 cache at the ISA level.
return Changed;
}
bool SIGfx6CacheControl::enableRMWCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
/// The L1 cache is write-through so does not need to be bypassed. There is no
/// bypass control for the L2 cache at the ISA level.
return Changed;
}
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, and handling them
// here would pessimize all atomics; they also do not support the nontemporal
// attribute.
assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
if (IsVolatile) {
if (Op == SIMemOp::LOAD)
Changed |= enableGLCBit(MI);
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);
return Changed;
}
if (IsNonTemporal) {
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}
return Changed;
}
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
bool VMCnt = false;
bool LGKMCnt = false;
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
VMCnt |= true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The L1 cache keeps all memory operations in order for
// wavefronts in the same work-group.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/GDS memory as LDS operations could be
// reordered with respect to later global/GDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
// the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
// If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
// is not needed as GDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/LDS memory as GDS operations could be
// reordered with respect to later global/LDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The GDS keeps all memory operations in order for
// the same work-group.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
Changed = true;
}
if (Pos == Position::AFTER)
--MI;
return Changed;
}
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory cache
/// to be flushed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
return Changed;
}
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos);
}
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
? AMDGPU::BUFFER_WBINVL1
: AMDGPU::BUFFER_WBINVL1_VOL;
if (Pos == Position::AFTER)
++MI;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory cache
/// to be flushed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
return Changed;
}
bool SIGfx90ACacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WORKGROUP:
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to bypass the L1 which is per CU.
// Otherwise in non-threadgroup split mode all waves of a work-group are
// on the same CU, and so the L1 does not need to be bypassed.
if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to bypass.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory caches
/// to be bypassed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
return Changed;
}
bool SIGfx90ACacheControl::enableStoreCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
/// Do not set glc for store atomic operations as they implicitly write
/// through the L1 cache.
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to bypass. Store atomics implicitly write through the L1
// cache.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory caches
/// to be bypassed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
return Changed;
}
bool SIGfx90ACacheControl::enableRMWCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
/// Do not set glc for RMW atomic operations as they implicitly bypass
/// the L1 cache, and the glc bit is instead used to indicate if they are
/// return or no-return.
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to bypass. RMW atomics implicitly bypass the L1 cache.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
return Changed;
}
bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, and handling them
// here would pessimize all atomics; they also do not support the nontemporal
// attribute.
assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
if (IsVolatile) {
if (Op == SIMemOp::LOAD) {
Changed |= enableGLCBit(MI);
}
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);
return Changed;
}
if (IsNonTemporal) {
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}
return Changed;
}
bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
if (ST.isTgSplitEnabled()) {
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to wait for global or GDS memory operations
// to complete to ensure they are visible to waves in the other CUs.
// Otherwise in non-threadgroup split mode all waves of a work-group are on
// the same CU, so no need to wait for global memory as all waves in the
// work-group access the same L1, nor wait for GDS as accesses are ordered
// on a CU.
if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
(Scope == SIAtomicScope::WORKGROUP)) {
// Same as GFX7 using agent scope.
Scope = SIAtomicScope::AGENT;
}
// In threadgroup split mode LDS cannot be allocated so no need to wait for
// LDS memory operations.
AddrSpace &= ~SIAtomicAddrSpace::LDS;
}
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
IsCrossAddrSpaceOrdering, Pos);
}
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Ensures that following loads will not see stale remote VMEM data or
// stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
// CC will never be stale due to the local memory probes.
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
// Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
// hardware does not reorder memory operations by the same wave with
// respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
// remove any cache lines of earlier writes by the same wave and ensures
// later reads by the same wave will refetch the cache lines.
Changed = true;
break;
case SIAtomicScope::AGENT:
// Same as GFX7.
break;
case SIAtomicScope::WORKGROUP:
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to invalidate the L1 which is per CU.
// Otherwise in non-threadgroup split mode all waves of a work-group are
// on the same CU, and so the L1 does not need to be invalidated.
if (ST.isTgSplitEnabled()) {
// Same as GFX7 using agent scope.
Scope = SIAtomicScope::AGENT;
}
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// Same as GFX7.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory cache
/// to be flushed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
return Changed;
}
bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
// Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
// hardware does not reorder memory operations by the same wave with
// respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
// to initiate writeback of any dirty cache lines of earlier writes by the
// same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
// writeback has completed.
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
// Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
// vmcnt(0)" needed by the "BUFFER_WBL2".
Changed = true;
break;
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// Same as GFX7.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if (Pos == Position::AFTER)
--MI;
Changed |=
SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
IsCrossAddrSpaceOrdering, Pos);
return Changed;
}
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
/// TODO Do not set glc for rmw atomic operations as they
/// implicitly bypass the L0/L1 caches.
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
Changed |= enableGLCBit(MI);
Changed |= enableDLCBit(MI);
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to bypass.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory caches
/// to be bypassed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
return Changed;
}
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile, and handling them
// here would pessimize all atomics; they also do not support the nontemporal
// attribute.
assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
if (IsVolatile) {
if (Op == SIMemOp::LOAD) {
Changed |= enableGLCBit(MI);
Changed |= enableDLCBit(MI);
}
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);
return Changed;
}
if (IsNonTemporal) {
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
Changed |= enableSLCBit(MI);
return Changed;
}
return Changed;
}
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
bool VMCnt = false;
bool VSCnt = false;
bool LGKMCnt = false;
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
VSCnt |= true;
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode all waves of a work-group are on the same CU
// which shares the same L0.
if (!ST.isCuModeEnabled()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
VSCnt |= true;
}
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The L0 cache keeps all memory operations in order for
// work-items in the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/GDS memory as LDS operations could be
// reordered with respect to later global/GDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The LDS keeps all memory operations in order for
// the same wavefront.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
// If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
// is not needed as GDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
// synchronizing with global/LDS memory as GDS operations could be
// reordered with respect to later global/LDS memory operations of the
// same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// The GDS keeps all memory operations in order for
// the same work-group.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
Changed = true;
}
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(0);
Changed = true;
}
if (Pos == Position::AFTER)
--MI;
return Changed;
}
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
if (!InsertCacheInv)
return false;
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
if (Pos == Position::AFTER)
++MI;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
// in CU mode all waves of a work-group are on the same CU, and so the
// L0 does not need to be invalidated.
if (!ST.isCuModeEnabled()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
Changed = true;
}
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to invalidate.
break;
default:
llvm_unreachable("Unsupported synchronization scope");
}
}
/// The scratch address space does not need the global memory cache
/// to be flushed as all memory operations by the same thread are
/// sequentially consistent, and no other thread can access scratch
/// memory.
/// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
return Changed;
}
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
for (auto &MI : AtomicPseudoMIs)
MI->eraseFromParent();
AtomicPseudoMIs.clear();
return true;
}
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
MOI.getOrderingAddrSpace());
}
if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getInstrAddrSpace(),
SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
}
return Changed;
}
// Atomic instructions already bypass caches to the scope specified by the
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
// need additional treatment.
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
SIMemOp::LOAD, MOI.isVolatile(),
MOI.isNonTemporal());
return Changed;
}
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
MOI.getOrderingAddrSpace());
}
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
return Changed;
}
// Atomic instructions already bypass caches to the scope specified by the
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
// need additional treatment.
Changed |= CC->enableVolatileAndOrNonTemporal(
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
MOI.isNonTemporal());
return Changed;
}
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
AtomicPseudoMIs.push_back(MI);
bool Changed = false;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
/// TODO: This relies on a barrier always generating a waitcnt
/// for LDS to ensure it is not reordered with the completion of
/// the preceding LDS operations. If the barrier had a memory
/// ordering and memory scope, then the library would not need to
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
/// adding S_WAITCNT before an S_BARRIER.
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
// TODO: If both release and invalidate are happening they could be combined
// to use the single "BUFFER_WBINV*" instruction. This could be done by
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
// track cache invalidate and write back instructions.
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::BEFORE);
return Changed;
}
return Changed;
}
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
MOI.getInstrAddrSpace());
}
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(MI, MOI.getScope(),
MOI.getInstrAddrSpace(),
isAtomicRet(*MI) ? SIMemOp::LOAD :
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
}
return Changed;
}
return Changed;
}
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
SIMemOpAccess MOA(MF);
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
// Unbundle instructions after the post-RA scheduler.
if (MI->isBundle() && MI->mayLoadOrStore()) {
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
I->unbundleFromPred();
for (MachineOperand &MO : I->operands())
if (MO.isReg())
MO.setIsInternalRead(false);
}
MI->eraseFromParent();
MI = II->getIterator();
}
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(MOI.getValue(), MI);
else if (const auto &MOI = MOA.getStoreInfo(MI))
Changed |= expandStore(MOI.getValue(), MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(MOI.getValue(), MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
}
}
Changed |= removeAtomicPseudoMIs();
return Changed;
}
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
FunctionPass *llvm::createSIMemoryLegalizerPass() {
return new SIMemoryLegalizer();
}