
Use a local pointer type to represent named barriers in the builtins and intrinsics. This makes the definitions more user friendly because users do not need to worry about hardware barrier ID assignment, and it is closer to how other popular GPU programming languages model named barriers. Named barriers are represented as global variables in addrspace(3) in LLVM IR. The compiler assigns special LDS offsets to those variables during the AMDGPULowerModuleLDS pass, and those addresses are converted to hardware barrier IDs during instruction selection. The remaining instruction-selection changes are primarily a consequence of the intrinsic-definition changes.
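As an illustration, here is a minimal sketch of what this representation could look like in LLVM IR. The global `@bar`, the `target("amdgcn.named.barrier", 0)` type, and the exact intrinsic signatures are placeholders chosen for this example, not an exact reproduction of the definitions introduced by this change:

```llvm
; A named barrier modeled as an addrspace(3) global variable. The
; AMDGPULowerModuleLDS pass assigns it a special LDS offset, and instruction
; selection later converts that address into a hardware barrier ID, so the
; kernel never references the ID directly.
@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison

define amdgpu_kernel void @use_named_barrier() {
  ; Placeholder intrinsic uses: the barrier is identified by its pointer.
  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
  call void @llvm.amdgcn.s.barrier.wait(i16 1)
  ret void
}
```

Because the pointer is just an addrspace(3) address until instruction selection, frontends can declare and pass named barriers like any other LDS object and leave the hardware ID assignment entirely to the backend.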
AMDGPULegalizerInfo.cpp · 7584 lines · 270 KiB · C++
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the Machinelegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use"
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
|
|
// type.
|
|
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
|
|
const LLT EltTy = Ty.getElementType();
|
|
const int Size = Ty.getSizeInBits();
|
|
const int EltSize = EltTy.getSizeInBits();
|
|
const int NextMul32 = (Size + 31) / 32;
|
|
|
|
assert(EltSize < 32);
|
|
|
|
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
|
|
return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
|
|
};
|
|
}
|
|
|
|
// Increase the number of vector elements to reach the next legal RegClass.
|
|
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
const unsigned NumElts = Ty.getNumElements();
|
|
const unsigned EltSize = Ty.getElementType().getSizeInBits();
|
|
const unsigned MaxNumElts = MaxRegisterSize / EltSize;
|
|
|
|
assert(EltSize == 32 || EltSize == 64);
|
|
assert(Ty.getSizeInBits() < MaxRegisterSize);
|
|
|
|
unsigned NewNumElts;
|
|
// Find the nearest legal RegClass that is larger than the current type.
|
|
for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
|
|
if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
|
|
break;
|
|
}
|
|
|
|
return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
|
|
};
|
|
}
|
|
|
|
static LLT getBufferRsrcScalarType(const LLT Ty) {
|
|
if (!Ty.isVector())
|
|
return LLT::scalar(128);
|
|
const ElementCount NumElems = Ty.getElementCount();
|
|
return LLT::vector(NumElems, LLT::scalar(128));
|
|
}
|
|
|
|
static LLT getBufferRsrcRegisterType(const LLT Ty) {
|
|
if (!Ty.isVector())
|
|
return LLT::fixed_vector(4, LLT::scalar(32));
|
|
const unsigned NumElems = Ty.getElementCount().getFixedValue();
|
|
return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
|
|
}
|
|
|
|
static LLT getBitcastRegisterType(const LLT Ty) {
|
|
const unsigned Size = Ty.getSizeInBits();
|
|
|
|
if (Size <= 32) {
|
|
// <2 x s8> -> s16
|
|
// <4 x s8> -> s32
|
|
return LLT::scalar(Size);
|
|
}
|
|
|
|
return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
|
|
}
|
|
|
|
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
return std::pair(TypeIdx, getBitcastRegisterType(Ty));
|
|
};
|
|
}
|
|
|
|
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
unsigned Size = Ty.getSizeInBits();
|
|
assert(Size % 32 == 0);
|
|
return std::pair(
|
|
TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
|
|
};
|
|
}
|
|
|
|
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT QueryTy = Query.Types[TypeIdx];
|
|
return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
|
|
};
|
|
}
|
|
|
|
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT QueryTy = Query.Types[TypeIdx];
|
|
return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
|
|
};
|
|
}
|
|
|
|
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT QueryTy = Query.Types[TypeIdx];
|
|
return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
|
|
};
|
|
}
|
|
|
|
static bool isRegisterSize(unsigned Size) {
|
|
return Size % 32 == 0 && Size <= MaxRegisterSize;
|
|
}
|
|
|
|
static bool isRegisterVectorElementType(LLT EltTy) {
|
|
const int EltSize = EltTy.getSizeInBits();
|
|
return EltSize == 16 || EltSize % 32 == 0;
|
|
}
|
|
|
|
static bool isRegisterVectorType(LLT Ty) {
|
|
const int EltSize = Ty.getElementType().getSizeInBits();
|
|
return EltSize == 32 || EltSize == 64 ||
|
|
(EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
|
|
EltSize == 128 || EltSize == 256;
|
|
}
|
|
|
|
// TODO: replace all uses of isRegisterType with isRegisterClassType
|
|
static bool isRegisterType(LLT Ty) {
|
|
if (!isRegisterSize(Ty.getSizeInBits()))
|
|
return false;
|
|
|
|
if (Ty.isVector())
|
|
return isRegisterVectorType(Ty);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Any combination of 32 or 64-bit elements up the maximum register size, and
|
|
// multiples of v2s16.
|
|
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
return isRegisterType(Query.Types[TypeIdx]);
|
|
};
|
|
}
|
|
|
|
// RegisterType that doesn't have a corresponding RegClass.
|
|
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
|
|
// should be removed.
|
|
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
LLT Ty = Query.Types[TypeIdx];
|
|
return isRegisterType(Ty) &&
|
|
!SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
|
|
};
|
|
}
|
|
|
|
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT QueryTy = Query.Types[TypeIdx];
|
|
if (!QueryTy.isVector())
|
|
return false;
|
|
const LLT EltTy = QueryTy.getElementType();
|
|
return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
|
|
};
|
|
}
|
|
|
|
static const LLT S1 = LLT::scalar(1);
|
|
static const LLT S8 = LLT::scalar(8);
|
|
static const LLT S16 = LLT::scalar(16);
|
|
static const LLT S32 = LLT::scalar(32);
|
|
static const LLT F32 = LLT::float32();
|
|
static const LLT S64 = LLT::scalar(64);
|
|
static const LLT F64 = LLT::float64();
|
|
static const LLT S96 = LLT::scalar(96);
|
|
static const LLT S128 = LLT::scalar(128);
|
|
static const LLT S160 = LLT::scalar(160);
|
|
static const LLT S192 = LLT::scalar(192);
|
|
static const LLT S224 = LLT::scalar(224);
|
|
static const LLT S256 = LLT::scalar(256);
|
|
static const LLT S512 = LLT::scalar(512);
|
|
static const LLT S1024 = LLT::scalar(1024);
|
|
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
|
|
|
|
static const LLT V2S8 = LLT::fixed_vector(2, 8);
|
|
static const LLT V2S16 = LLT::fixed_vector(2, 16);
|
|
static const LLT V4S16 = LLT::fixed_vector(4, 16);
|
|
static const LLT V6S16 = LLT::fixed_vector(6, 16);
|
|
static const LLT V8S16 = LLT::fixed_vector(8, 16);
|
|
static const LLT V10S16 = LLT::fixed_vector(10, 16);
|
|
static const LLT V12S16 = LLT::fixed_vector(12, 16);
|
|
static const LLT V16S16 = LLT::fixed_vector(16, 16);
|
|
|
|
static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
|
|
static const LLT V2BF16 = V2F16; // FIXME
|
|
|
|
static const LLT V2S32 = LLT::fixed_vector(2, 32);
|
|
static const LLT V3S32 = LLT::fixed_vector(3, 32);
|
|
static const LLT V4S32 = LLT::fixed_vector(4, 32);
|
|
static const LLT V5S32 = LLT::fixed_vector(5, 32);
|
|
static const LLT V6S32 = LLT::fixed_vector(6, 32);
|
|
static const LLT V7S32 = LLT::fixed_vector(7, 32);
|
|
static const LLT V8S32 = LLT::fixed_vector(8, 32);
|
|
static const LLT V9S32 = LLT::fixed_vector(9, 32);
|
|
static const LLT V10S32 = LLT::fixed_vector(10, 32);
|
|
static const LLT V11S32 = LLT::fixed_vector(11, 32);
|
|
static const LLT V12S32 = LLT::fixed_vector(12, 32);
|
|
static const LLT V16S32 = LLT::fixed_vector(16, 32);
|
|
static const LLT V32S32 = LLT::fixed_vector(32, 32);
|
|
|
|
static const LLT V2S64 = LLT::fixed_vector(2, 64);
|
|
static const LLT V3S64 = LLT::fixed_vector(3, 64);
|
|
static const LLT V4S64 = LLT::fixed_vector(4, 64);
|
|
static const LLT V5S64 = LLT::fixed_vector(5, 64);
|
|
static const LLT V6S64 = LLT::fixed_vector(6, 64);
|
|
static const LLT V7S64 = LLT::fixed_vector(7, 64);
|
|
static const LLT V8S64 = LLT::fixed_vector(8, 64);
|
|
static const LLT V16S64 = LLT::fixed_vector(16, 64);
|
|
|
|
static const LLT V2S128 = LLT::fixed_vector(2, 128);
|
|
static const LLT V4S128 = LLT::fixed_vector(4, 128);
|
|
|
|
static std::initializer_list<LLT> AllScalarTypes = {
|
|
S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
|
|
|
|
static std::initializer_list<LLT> AllS16Vectors{
|
|
V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
|
|
|
|
static std::initializer_list<LLT> AllS32Vectors = {
|
|
V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
|
|
V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
|
|
|
|
static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
|
|
V6S64, V7S64, V8S64, V16S64};
|
|
|
|
// Checks whether a type is in the list of legal register types.
|
|
static bool isRegisterClassType(LLT Ty) {
|
|
if (Ty.isPointerOrPointerVector())
|
|
Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
|
|
|
|
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
|
|
is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
|
|
}
|
|
|
|
static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
|
|
return [TypeIdx](const LegalityQuery &Query) {
|
|
return isRegisterClassType(Query.Types[TypeIdx]);
|
|
};
|
|
}
|
|
|
|
// If we have a truncating store or an extending load with a data size larger
|
|
// than 32-bits, we need to reduce to a 32-bit type.
|
|
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
|
|
return [=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
|
|
Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
|
|
};
|
|
}
|
|
|
|
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
|
|
// handle some operations by just promoting the register during
|
|
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
|
|
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
|
|
bool IsLoad, bool IsAtomic) {
|
|
switch (AS) {
|
|
case AMDGPUAS::PRIVATE_ADDRESS:
|
|
// FIXME: Private element size.
|
|
return ST.enableFlatScratch() ? 128 : 32;
|
|
case AMDGPUAS::LOCAL_ADDRESS:
|
|
return ST.useDS128() ? 128 : 64;
|
|
case AMDGPUAS::GLOBAL_ADDRESS:
|
|
case AMDGPUAS::CONSTANT_ADDRESS:
|
|
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
|
|
case AMDGPUAS::BUFFER_RESOURCE:
|
|
// Treat constant and global as identical. SMRD loads are sometimes usable for
|
|
// global loads (ideally constant address space should be eliminated)
|
|
// depending on the context. Legality cannot be context dependent, but
|
|
// RegBankSelect can split the load as necessary depending on the pointer
|
|
// register bank/uniformity and if the memory is invariant or not written in a
|
|
// kernel.
|
|
return IsLoad ? 512 : 128;
|
|
default:
|
|
// FIXME: Flat addresses may contextually need to be split to 32-bit parts
|
|
// if they may alias scratch depending on the subtarget. This needs to be
|
|
// moved to custom handling to use addressMayBeAccessedAsPrivate
|
|
return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
|
|
}
|
|
}
|
|
|
|
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
|
|
const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[0];
|
|
|
|
// Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
|
|
const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
|
|
|
|
unsigned RegSize = Ty.getSizeInBits();
|
|
uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
|
|
uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
|
|
unsigned AS = Query.Types[1].getAddressSpace();
|
|
|
|
// All of these need to be custom lowered to cast the pointer operand.
|
|
if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
|
|
return false;
|
|
|
|
// Do not handle extending vector loads.
|
|
if (Ty.isVector() && MemSize != RegSize)
|
|
return false;
|
|
|
|
// TODO: We should be able to widen loads if the alignment is high enough, but
|
|
// we also need to modify the memory access size.
|
|
#if 0
|
|
// Accept widening loads based on alignment.
|
|
if (IsLoad && MemSize < Size)
|
|
MemSize = std::max(MemSize, Align);
|
|
#endif
|
|
|
|
// Only 1-byte and 2-byte to 32-bit extloads are valid.
|
|
if (MemSize != RegSize && RegSize != 32)
|
|
return false;
|
|
|
|
if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
|
|
Query.MMODescrs[0].Ordering !=
|
|
AtomicOrdering::NotAtomic))
|
|
return false;
|
|
|
|
switch (MemSize) {
|
|
case 8:
|
|
case 16:
|
|
case 32:
|
|
case 64:
|
|
case 128:
|
|
break;
|
|
case 96:
|
|
if (!ST.hasDwordx3LoadStores())
|
|
return false;
|
|
break;
|
|
case 256:
|
|
case 512:
|
|
// These may contextually need to be broken down.
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
assert(RegSize >= MemSize);
|
|
|
|
if (AlignBits < MemSize) {
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
|
|
Align(AlignBits / 8)))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// The newer buffer intrinsic forms take their resource arguments as
|
|
// pointers in address space 8, aka s128 values. However, in order to not break
|
|
// SelectionDAG, the underlying operations have to continue to take v4i32
|
|
// arguments. Therefore, we convert resource pointers - or vectors of them
|
|
// to integer values here.
|
|
static bool hasBufferRsrcWorkaround(const LLT Ty) {
|
|
if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
|
|
return true;
|
|
if (Ty.isVector()) {
|
|
const LLT ElemTy = Ty.getElementType();
|
|
return hasBufferRsrcWorkaround(ElemTy);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
|
|
// workaround this. Eventually it should ignore the type for loads and only care
|
|
// about the size. Return true in cases where we will workaround this for now by
|
|
// bitcasting.
|
|
static bool loadStoreBitcastWorkaround(const LLT Ty) {
|
|
if (EnableNewLegality)
|
|
return false;
|
|
|
|
const unsigned Size = Ty.getSizeInBits();
|
|
if (Ty.isPointerVector())
|
|
return true;
|
|
if (Size <= 64)
|
|
return false;
|
|
// Address space 8 pointers get their own workaround.
|
|
if (hasBufferRsrcWorkaround(Ty))
|
|
return false;
|
|
if (!Ty.isVector())
|
|
return true;
|
|
|
|
unsigned EltSize = Ty.getScalarSizeInBits();
|
|
return EltSize != 32 && EltSize != 64;
|
|
}
|
|
|
|
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[0];
|
|
return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
|
|
!hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
|
|
}
|
|
|
|
/// Return true if a load or store of the type should be lowered with a bitcast
|
|
/// to a different type.
|
|
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
|
|
const LLT MemTy) {
|
|
const unsigned MemSizeInBits = MemTy.getSizeInBits();
|
|
const unsigned Size = Ty.getSizeInBits();
|
|
if (Size != MemSizeInBits)
|
|
return Size <= 32 && Ty.isVector();
|
|
|
|
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
|
|
return true;
|
|
|
|
// Don't try to handle bitcasting vector ext loads for now.
|
|
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
|
|
(Size <= 32 || isRegisterSize(Size)) &&
|
|
!isRegisterVectorElementType(Ty.getElementType());
|
|
}
|
|
|
|
/// Return true if we should legalize a load by widening an odd sized memory
|
|
/// access up to the alignment. Note this case when the memory access itself
|
|
/// changes, not the size of the result register.
|
|
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
|
|
uint64_t AlignInBits, unsigned AddrSpace,
|
|
unsigned Opcode) {
|
|
unsigned SizeInBits = MemoryTy.getSizeInBits();
|
|
// We don't want to widen cases that are naturally legal.
|
|
if (isPowerOf2_32(SizeInBits))
|
|
return false;
|
|
|
|
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
|
|
// end up widening these for a scalar load during RegBankSelect, if we don't
|
|
// have 96-bit scalar loads.
|
|
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
|
|
return false;
|
|
|
|
if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
|
|
return false;
|
|
|
|
// A load is known dereferenceable up to the alignment, so it's legal to widen
|
|
// to it.
|
|
//
|
|
// TODO: Could check dereferenceable for less aligned cases.
|
|
unsigned RoundedSize = NextPowerOf2(SizeInBits);
|
|
if (AlignInBits < RoundedSize)
|
|
return false;
|
|
|
|
// Do not widen if it would introduce a slow unaligned load.
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
unsigned Fast = 0;
|
|
return TLI->allowsMisalignedMemoryAccessesImpl(
|
|
RoundedSize, AddrSpace, Align(AlignInBits / 8),
|
|
MachineMemOperand::MOLoad, &Fast) &&
|
|
Fast;
|
|
}
|
|
|
|
static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
|
|
unsigned Opcode) {
|
|
if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
|
|
return false;
|
|
|
|
return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
|
|
Query.MMODescrs[0].AlignInBits,
|
|
Query.Types[1].getAddressSpace(), Opcode);
|
|
}
|
|
|
|
/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
|
|
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
|
|
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
|
|
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
|
|
MachineRegisterInfo &MRI, unsigned Idx) {
|
|
MachineOperand &MO = MI.getOperand(Idx);
|
|
|
|
const LLT PointerTy = MRI.getType(MO.getReg());
|
|
|
|
// Paranoidly prevent us from doing this multiple times.
|
|
if (!hasBufferRsrcWorkaround(PointerTy))
|
|
return PointerTy;
|
|
|
|
const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
|
|
const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
|
|
if (!PointerTy.isVector()) {
|
|
// Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
|
|
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
|
|
std::array<Register, 4> VectorElems;
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
|
for (unsigned I = 0; I < NumParts; ++I)
|
|
VectorElems[I] =
|
|
B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
|
|
B.buildMergeValues(MO, VectorElems);
|
|
MO.setReg(VectorReg);
|
|
return VectorTy;
|
|
}
|
|
Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
|
auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
|
|
B.buildIntToPtr(MO, Scalar);
|
|
MO.setReg(BitcastReg);
|
|
|
|
return VectorTy;
|
|
}
|
|
|
|
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
|
|
/// the form in which the value must be in order to be passed to the low-level
|
|
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
|
|
/// needed in order to account for the fact that we can't define a register
|
|
/// class for s128 without breaking SelectionDAG.
|
|
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
const LLT PointerTy = MRI.getType(Pointer);
|
|
const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
|
|
const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
|
|
|
|
if (!PointerTy.isVector()) {
|
|
// Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
|
|
SmallVector<Register, 4> PointerParts;
|
|
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
|
|
auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
|
|
for (unsigned I = 0; I < NumParts; ++I)
|
|
PointerParts.push_back(Unmerged.getReg(I));
|
|
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
|
|
}
|
|
Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
|
|
return B.buildBitcast(VectorTy, Scalar).getReg(0);
|
|
}
|
|
|
|
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
|
|
unsigned Idx) {
|
|
MachineOperand &MO = MI.getOperand(Idx);
|
|
|
|
const LLT PointerTy = B.getMRI()->getType(MO.getReg());
|
|
// Paranoidly prevent us from doing this multiple times.
|
|
if (!hasBufferRsrcWorkaround(PointerTy))
|
|
return;
|
|
MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
|
|
}
|
|
|
|
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
|
|
const GCNTargetMachine &TM)
|
|
: ST(ST_) {
|
|
using namespace TargetOpcode;
|
|
|
|
auto GetAddrSpacePtr = [&TM](unsigned AS) {
|
|
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
|
|
};
|
|
|
|
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
|
|
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
|
|
const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
|
|
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
|
|
const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
|
|
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
|
|
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
|
|
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
|
|
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
|
|
const LLT BufferStridedPtr =
|
|
GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
|
|
|
|
const LLT CodePtr = FlatPtr;
|
|
|
|
const std::initializer_list<LLT> AddrSpaces64 = {
|
|
GlobalPtr, ConstantPtr, FlatPtr
|
|
};
|
|
|
|
const std::initializer_list<LLT> AddrSpaces32 = {
|
|
LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
|
|
};
|
|
|
|
const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
|
|
|
|
const std::initializer_list<LLT> FPTypesBase = {
|
|
S32, S64
|
|
};
|
|
|
|
const std::initializer_list<LLT> FPTypes16 = {
|
|
S32, S64, S16
|
|
};
|
|
|
|
const std::initializer_list<LLT> FPTypesPK16 = {
|
|
S32, S64, S16, V2S16
|
|
};
|
|
|
|
const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
|
|
|
|
// s1 for VCC branches, s32 for SCC branches.
|
|
getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
|
|
|
|
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
|
|
// elements for v3s16
|
|
getActionDefinitionsBuilder(G_PHI)
|
|
.legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
|
|
.legalFor(AllS32Vectors)
|
|
.legalFor(AllS64Vectors)
|
|
.legalFor(AddrSpaces64)
|
|
.legalFor(AddrSpaces32)
|
|
.legalFor(AddrSpaces128)
|
|
.legalIf(isPointer(0))
|
|
.clampScalar(0, S16, S256)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.clampMaxNumElements(0, S32, 16)
|
|
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
|
|
.scalarize(0);
|
|
|
|
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
|
|
// Full set of gfx9 features.
|
|
if (ST.hasScalarAddSub64()) {
|
|
getActionDefinitionsBuilder({G_ADD, G_SUB})
|
|
.legalFor({S64, S32, S16, V2S16})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.maxScalar(0, S32);
|
|
} else {
|
|
getActionDefinitionsBuilder({G_ADD, G_SUB})
|
|
.legalFor({S32, S16, V2S16})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.maxScalar(0, S32);
|
|
}
|
|
|
|
if (ST.hasScalarSMulU64()) {
|
|
getActionDefinitionsBuilder(G_MUL)
|
|
.legalFor({S64, S32, S16, V2S16})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.custom();
|
|
} else {
|
|
getActionDefinitionsBuilder(G_MUL)
|
|
.legalFor({S32, S16, V2S16})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.custom();
|
|
}
|
|
assert(ST.hasMad64_32());
|
|
|
|
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
|
|
.legalFor({S32, S16, V2S16}) // Clamp modifier
|
|
.minScalarOrElt(0, S16)
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.lower();
|
|
} else if (ST.has16BitInsts()) {
|
|
getActionDefinitionsBuilder({G_ADD, G_SUB})
|
|
.legalFor({S32, S16})
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.maxScalar(0, S32)
|
|
.scalarize(0);
|
|
|
|
getActionDefinitionsBuilder(G_MUL)
|
|
.legalFor({S32, S16})
|
|
.scalarize(0)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.custom();
|
|
assert(ST.hasMad64_32());
|
|
|
|
// Technically the saturating operations require clamp bit support, but this
|
|
// was introduced at the same time as 16-bit operations.
|
|
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
|
|
.legalFor({S32, S16}) // Clamp modifier
|
|
.minScalar(0, S16)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0, 16)
|
|
.lower();
|
|
|
|
// We're just lowering this, but it helps get a better result to try to
|
|
// coerce to the desired type first.
|
|
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
|
|
.minScalar(0, S16)
|
|
.scalarize(0)
|
|
.lower();
|
|
} else {
|
|
getActionDefinitionsBuilder({G_ADD, G_SUB})
|
|
.legalFor({S32})
|
|
.widenScalarToNextMultipleOf(0, 32)
|
|
.clampScalar(0, S32, S32)
|
|
.scalarize(0);
|
|
|
|
auto &Mul = getActionDefinitionsBuilder(G_MUL)
|
|
.legalFor({S32})
|
|
.scalarize(0)
|
|
.minScalar(0, S32)
|
|
.widenScalarToNextMultipleOf(0, 32);
|
|
|
|
if (ST.hasMad64_32())
|
|
Mul.custom();
|
|
else
|
|
Mul.maxScalar(0, S32);
|
|
|
|
if (ST.hasIntClamp()) {
|
|
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
|
|
.legalFor({S32}) // Clamp modifier.
|
|
.scalarize(0)
|
|
.minScalarOrElt(0, S32)
|
|
.lower();
|
|
} else {
|
|
// Clamp bit support was added in VI, along with 16-bit operations.
|
|
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
|
|
.minScalar(0, S32)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
|
|
// FIXME: DAG expansion gets better results. The widening uses the smaller
|
|
// range values and goes for the min/max lowering directly.
|
|
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
|
|
.minScalar(0, S32)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder(
|
|
{G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
|
|
.customFor({S32, S64})
|
|
.clampScalar(0, S32, S64)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.scalarize(0);
|
|
|
|
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
|
|
.legalFor({S32})
|
|
.maxScalar(0, S32);
|
|
|
|
if (ST.hasVOP3PInsts()) {
|
|
Mulh
|
|
.clampMaxNumElements(0, S8, 2)
|
|
.lowerFor({V2S8});
|
|
}
|
|
|
|
Mulh
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
// Report legal for any types we can handle anywhere. For the cases only legal
|
|
// on the SALU, RegBankSelect will be able to re-legalize.
|
|
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
|
|
.legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
|
|
.clampScalar(0, S32, S64)
|
|
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
|
|
.fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
|
|
.widenScalarToNextPow2(0)
|
|
.scalarize(0);
|
|
|
|
getActionDefinitionsBuilder(
|
|
{G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
|
|
.legalFor({{S32, S1}, {S32, S32}})
|
|
.clampScalar(0, S32, S32)
|
|
.scalarize(0);
|
|
|
|
getActionDefinitionsBuilder(G_BITCAST)
|
|
// Don't worry about the size constraint.
|
|
.legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder(G_CONSTANT)
|
|
.legalFor({S1, S32, S64, S16, GlobalPtr,
|
|
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
|
|
.legalIf(isPointer(0))
|
|
.clampScalar(0, S32, S64)
|
|
.widenScalarToNextPow2(0);
|
|
|
|
getActionDefinitionsBuilder(G_FCONSTANT)
|
|
.legalFor({S32, S64, S16})
|
|
.clampScalar(0, S16, S64);
|
|
|
|
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
|
|
.legalIf(isRegisterClassType(0))
|
|
// s1 and s16 are special cases because they have legal operations on
|
|
// them, but don't really occupy registers in the normal way.
|
|
.legalFor({S1, S16})
|
|
.clampNumElements(0, V16S32, V32S32)
|
|
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
|
|
.clampScalarOrElt(0, S32, MaxScalar)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.clampMaxNumElements(0, S32, 16);
|
|
|
|
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
|
|
|
|
// If the amount is divergent, we have to do a wave reduction to get the
|
|
// maximum value, so this is expanded during RegBankSelect.
|
|
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
|
|
.legalFor({{PrivatePtr, S32}});
|
|
|
|
getActionDefinitionsBuilder(G_STACKSAVE)
|
|
.customFor({PrivatePtr});
|
|
getActionDefinitionsBuilder(G_STACKRESTORE)
|
|
.legalFor({PrivatePtr});
|
|
|
|
getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
|
|
|
|
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
|
|
.customIf(typeIsNot(0, PrivatePtr));
|
|
|
|
getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
|
|
|
|
auto &FPOpActions = getActionDefinitionsBuilder(
|
|
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
|
|
G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
|
|
.legalFor({S32, S64});
|
|
auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
|
|
.customFor({S32, S64});
|
|
auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
|
|
.customFor({S32, S64});
|
|
|
|
if (ST.has16BitInsts()) {
|
|
if (ST.hasVOP3PInsts())
|
|
FPOpActions.legalFor({S16, V2S16});
|
|
else
|
|
FPOpActions.legalFor({S16});
|
|
|
|
TrigActions.customFor({S16});
|
|
FDIVActions.customFor({S16});
|
|
}
|
|
|
|
if (ST.hasPackedFP32Ops()) {
|
|
FPOpActions.legalFor({V2S32});
|
|
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
|
|
}
|
|
|
|
auto &MinNumMaxNum = getActionDefinitionsBuilder({
|
|
G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
|
|
|
|
if (ST.hasVOP3PInsts()) {
|
|
MinNumMaxNum.customFor(FPTypesPK16)
|
|
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
|
|
.clampMaxNumElements(0, S16, 2)
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(0);
|
|
} else if (ST.has16BitInsts()) {
|
|
MinNumMaxNum.customFor(FPTypes16)
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(0);
|
|
} else {
|
|
MinNumMaxNum.customFor(FPTypesBase)
|
|
.clampScalar(0, S32, S64)
|
|
.scalarize(0);
|
|
}
|
|
|
|
if (ST.hasVOP3PInsts())
|
|
FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
|
|
|
|
FPOpActions
|
|
.scalarize(0)
|
|
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
|
|
|
|
TrigActions
|
|
.scalarize(0)
|
|
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
|
|
|
|
FDIVActions
|
|
.scalarize(0)
|
|
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
|
|
|
|
getActionDefinitionsBuilder({G_FNEG, G_FABS})
|
|
.legalFor(FPTypesPK16)
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.clampScalar(0, S16, S64);
|
|
|
|
if (ST.has16BitInsts()) {
|
|
getActionDefinitionsBuilder(G_FSQRT)
|
|
.legalFor({S16})
|
|
.customFor({S32, S64})
|
|
.scalarize(0)
|
|
.unsupported();
|
|
getActionDefinitionsBuilder(G_FFLOOR)
|
|
.legalFor({S32, S64, S16})
|
|
.scalarize(0)
|
|
.clampScalar(0, S16, S64);
|
|
|
|
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
|
|
.legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
|
|
.scalarize(0)
|
|
.maxScalarIf(typeIs(0, S16), 1, S16)
|
|
.clampScalar(1, S32, S32)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder(G_FFREXP)
|
|
.customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
|
|
.scalarize(0)
|
|
.lower();
|
|
} else {
|
|
getActionDefinitionsBuilder(G_FSQRT)
|
|
.customFor({S32, S64, S16})
|
|
.scalarize(0)
|
|
.unsupported();
|
|
|
|
|
|
if (ST.hasFractBug()) {
|
|
getActionDefinitionsBuilder(G_FFLOOR)
|
|
.customFor({S64})
|
|
.legalFor({S32, S64})
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64);
|
|
} else {
|
|
getActionDefinitionsBuilder(G_FFLOOR)
|
|
.legalFor({S32, S64})
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64);
|
|
}
|
|
|
|
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
|
|
.legalFor({{S32, S32}, {S64, S32}})
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64)
|
|
.clampScalar(1, S32, S32)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder(G_FFREXP)
|
|
.customFor({{S32, S32}, {S64, S32}})
|
|
.scalarize(0)
|
|
.minScalar(0, S32)
|
|
.clampScalar(1, S32, S32)
|
|
.lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder(G_FPTRUNC)
|
|
.legalFor({{S32, S64}, {S16, S32}})
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder(G_FPEXT)
|
|
.legalFor({{S64, S32}, {S32, S16}})
|
|
.narrowScalarFor({{S64, S16}}, changeTo(0, S32))
|
|
.scalarize(0);
|
|
|
|
auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
|
|
if (ST.has16BitInsts()) {
|
|
FSubActions
|
|
// Use actual fsub instruction
|
|
.legalFor({S32, S16})
|
|
// Must use fadd + fneg
|
|
.lowerFor({S64, V2S16});
|
|
} else {
|
|
FSubActions
|
|
// Use actual fsub instruction
|
|
.legalFor({S32})
|
|
// Must use fadd + fneg
|
|
.lowerFor({S64, S16, V2S16});
|
|
}
|
|
|
|
FSubActions
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64);
|
|
|
|
// Whether this is legal depends on the floating point mode for the function.
|
|
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
|
|
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
|
|
FMad.customFor({S32, S16});
|
|
else if (ST.hasMadMacF32Insts())
|
|
FMad.customFor({S32});
|
|
else if (ST.hasMadF16())
|
|
FMad.customFor({S16});
|
|
FMad.scalarize(0)
|
|
.lower();
|
|
|
|
auto &FRem = getActionDefinitionsBuilder(G_FREM);
|
|
if (ST.has16BitInsts()) {
|
|
FRem.customFor({S16, S32, S64});
|
|
} else {
|
|
FRem.minScalar(0, S32)
|
|
.customFor({S32, S64});
|
|
}
|
|
FRem.scalarize(0);
|
|
|
|
// TODO: Do we need to clamp maximum bitwidth?
|
|
getActionDefinitionsBuilder(G_TRUNC)
|
|
.legalIf(isScalar(0))
|
|
.legalFor({{V2S16, V2S32}})
|
|
.clampMaxNumElements(0, S16, 2)
|
|
// Avoid scalarizing in cases that should be truly illegal. In unresolvable
|
|
// situations (like an invalid implicit use), we don't want to infinite loop
|
|
// in the legalizer.
|
|
.fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
|
|
.alwaysLegal();
|
|
|
|
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
|
|
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
|
|
{S32, S1}, {S64, S1}, {S16, S1}})
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64)
|
|
.widenScalarToNextPow2(1, 32);
|
|
|
|
// TODO: Split s1->s64 during regbankselect for VALU.
|
|
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
|
|
.legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
|
|
.lowerIf(typeIs(1, S1))
|
|
.customFor({{S32, S64}, {S64, S64}});
|
|
if (ST.has16BitInsts())
|
|
IToFP.legalFor({{S16, S16}});
|
|
IToFP.clampScalar(1, S32, S64)
|
|
.minScalar(0, S32)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(1);
|
|
|
|
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
|
|
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
|
|
.customFor({{S64, S32}, {S64, S64}})
|
|
.narrowScalarFor({{S64, S16}}, changeTo(0, S32));
|
|
if (ST.has16BitInsts())
|
|
FPToI.legalFor({{S16, S16}});
|
|
else
|
|
FPToI.minScalar(1, S32);
|
|
|
|
FPToI.minScalar(0, S32)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
|
|
.legalFor({S16, S32})
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
// Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
|
|
getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
if (ST.has16BitInsts()) {
|
|
getActionDefinitionsBuilder(
|
|
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
|
|
.legalFor({S16, S32, S64})
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(0);
|
|
} else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
|
|
getActionDefinitionsBuilder(
|
|
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
|
|
.legalFor({S32, S64})
|
|
.clampScalar(0, S32, S64)
|
|
.scalarize(0);
|
|
} else {
|
|
getActionDefinitionsBuilder(
|
|
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
|
|
.legalFor({S32})
|
|
.customFor({S64})
|
|
.clampScalar(0, S32, S64)
|
|
.scalarize(0);
|
|
}
|
|
|
|
getActionDefinitionsBuilder(G_PTR_ADD)
|
|
.unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
|
|
.legalIf(all(isPointer(0), sameSize(0, 1)))
|
|
.scalarize(0)
|
|
.scalarSameSizeAs(1, 0);
|
|
|
|
getActionDefinitionsBuilder(G_PTRMASK)
|
|
.legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
|
|
.scalarSameSizeAs(1, 0)
|
|
.scalarize(0);
|
|
|
|
auto &CmpBuilder =
|
|
getActionDefinitionsBuilder(G_ICMP)
|
|
// The compare output type differs based on the register bank of the output,
|
|
// so make both s1 and s32 legal.
|
|
//
|
|
// Scalar compares producing output in scc will be promoted to s32, as that
|
|
// is the allocatable register type that will be needed for the copy from
|
|
// scc. This will be promoted during RegBankSelect, and we assume something
|
|
// before that won't try to use s32 result types.
|
|
//
|
|
// Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
|
|
// bank.
|
|
.legalForCartesianProduct(
|
|
{S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
|
|
.legalForCartesianProduct(
|
|
{S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
|
|
if (ST.has16BitInsts()) {
|
|
CmpBuilder.legalFor({{S1, S16}});
|
|
}
|
|
|
|
CmpBuilder
|
|
.widenScalarToNextPow2(1)
|
|
.clampScalar(1, S32, S64)
|
|
.scalarize(0)
|
|
.legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
|
|
|
|
auto &FCmpBuilder =
|
|
getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
|
|
{S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
|
|
|
|
if (ST.hasSALUFloatInsts())
|
|
FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
|
|
|
|
FCmpBuilder
|
|
.widenScalarToNextPow2(1)
|
|
.clampScalar(1, S32, S64)
|
|
.scalarize(0);
|
|
|
|
// FIXME: fpow has a selection pattern that should move to custom lowering.
|
|
auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
|
|
if (ST.has16BitInsts())
|
|
ExpOps.customFor({{S32}, {S16}});
|
|
else
|
|
ExpOps.customFor({S32});
|
|
ExpOps.clampScalar(0, MinScalarFPTy, S32)
|
|
.scalarize(0);
|
|
|
|
getActionDefinitionsBuilder(G_FPOWI)
|
|
.clampScalar(0, MinScalarFPTy, S32)
|
|
.lower();
|
|
|
|
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
|
|
Log2Ops.customFor({S32});
|
|
if (ST.has16BitInsts())
|
|
Log2Ops.legalFor({S16});
|
|
else
|
|
Log2Ops.customFor({S16});
|
|
Log2Ops.scalarize(0)
|
|
.lower();
|
|
|
|
auto &LogOps =
|
|
getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
|
|
LogOps.customFor({S32, S16});
|
|
LogOps.clampScalar(0, MinScalarFPTy, S32)
|
|
.scalarize(0);
|
|
|
|
// The 64-bit versions produce 32-bit results, but only on the SALU.
|
|
getActionDefinitionsBuilder(G_CTPOP)
|
|
.legalFor({{S32, S32}, {S32, S64}})
|
|
.clampScalar(0, S32, S32)
|
|
.widenScalarToNextPow2(1, 32)
|
|
.clampScalar(1, S32, S64)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0, 32);
|
|
|
|
// If no 16 bit instr is available, lower into different instructions.
|
|
if (ST.has16BitInsts())
|
|
getActionDefinitionsBuilder(G_IS_FPCLASS)
|
|
.legalForCartesianProduct({S1}, FPTypes16)
|
|
.widenScalarToNextPow2(1)
|
|
.scalarize(0)
|
|
.lower();
|
|
else
|
|
getActionDefinitionsBuilder(G_IS_FPCLASS)
|
|
.legalForCartesianProduct({S1}, FPTypesBase)
|
|
.lowerFor({S1, S16})
|
|
.widenScalarToNextPow2(1)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
// The hardware instructions return a different result on 0 than the generic
|
|
// instructions expect. The hardware produces -1, but these produce the
|
|
// bitwidth.
|
|
getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S32)
|
|
.clampScalar(1, S32, S64)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.widenScalarToNextPow2(1, 32)
|
|
.custom();
|
|
|
|
// The 64-bit versions produce 32-bit results, but only on the SALU.
|
|
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
|
|
.legalFor({{S32, S32}, {S32, S64}})
|
|
.customIf(scalarNarrowerThan(1, 32))
|
|
.clampScalar(0, S32, S32)
|
|
.clampScalar(1, S32, S64)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.widenScalarToNextPow2(1, 32);
|
|
|
|
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
|
|
.legalFor({{S32, S32}, {S32, S64}})
|
|
.clampScalar(0, S32, S32)
|
|
.clampScalar(1, S32, S64)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0, 32)
|
|
.widenScalarToNextPow2(1, 32);
|
|
|
|
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
|
|
// RegBankSelect.
|
|
getActionDefinitionsBuilder(G_BITREVERSE)
|
|
.legalFor({S32, S64})
|
|
.clampScalar(0, S32, S64)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0);
|
|
|
|
if (ST.has16BitInsts()) {
|
|
getActionDefinitionsBuilder(G_BSWAP)
|
|
.legalFor({S16, S32, V2S16})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
// FIXME: Fixing non-power-of-2 before clamp is workaround for
|
|
// narrowScalar limitation.
|
|
.widenScalarToNextPow2(0)
|
|
.clampScalar(0, S16, S32)
|
|
.scalarize(0);
|
|
|
|
if (ST.hasVOP3PInsts()) {
|
|
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
|
|
.legalFor({S32, S16, V2S16})
|
|
.clampMaxNumElements(0, S16, 2)
|
|
.minScalar(0, S16)
|
|
.widenScalarToNextPow2(0)
|
|
.scalarize(0)
|
|
.lower();
|
|
} else {
|
|
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
|
|
.legalFor({S32, S16})
|
|
.widenScalarToNextPow2(0)
|
|
.minScalar(0, S16)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
} else {
|
|
// TODO: Should have same legality without v_perm_b32
|
|
getActionDefinitionsBuilder(G_BSWAP)
|
|
.legalFor({S32})
|
|
.lowerIf(scalarNarrowerThan(0, 32))
|
|
// FIXME: Fixing non-power-of-2 before clamp is workaround for
|
|
// narrowScalar limitation.
|
|
.widenScalarToNextPow2(0)
|
|
.maxScalar(0, S32)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
|
|
.legalFor({S32})
|
|
.minScalar(0, S32)
|
|
.widenScalarToNextPow2(0)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder(G_INTTOPTR)
|
|
// List the common cases
|
|
.legalForCartesianProduct(AddrSpaces64, {S64})
|
|
.legalForCartesianProduct(AddrSpaces32, {S32})
|
|
.scalarize(0)
|
|
// Accept any address space as long as the size matches
|
|
.legalIf(sameSize(0, 1))
|
|
.widenScalarIf(smallerThan(1, 0),
|
|
[](const LegalityQuery &Query) {
|
|
return std::pair(
|
|
1, LLT::scalar(Query.Types[0].getSizeInBits()));
|
|
})
|
|
.narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
|
|
return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
|
|
});
|
|
|
|
getActionDefinitionsBuilder(G_PTRTOINT)
|
|
// List the common cases
|
|
.legalForCartesianProduct(AddrSpaces64, {S64})
|
|
.legalForCartesianProduct(AddrSpaces32, {S32})
|
|
.scalarize(0)
|
|
// Accept any address space as long as the size matches
|
|
.legalIf(sameSize(0, 1))
|
|
.widenScalarIf(smallerThan(0, 1),
|
|
[](const LegalityQuery &Query) {
|
|
return std::pair(
|
|
0, LLT::scalar(Query.Types[1].getSizeInBits()));
|
|
})
|
|
.narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
|
|
return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
|
|
});
|
|
|
|
getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
|
|
.scalarize(0)
|
|
.custom();
|
|
|
|
const auto needToSplitMemOp = [=](const LegalityQuery &Query,
|
|
bool IsLoad) -> bool {
|
|
const LLT DstTy = Query.Types[0];
|
|
|
|
// Split vector extloads.
|
|
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
|
|
|
|
if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
|
|
return true;
|
|
|
|
const LLT PtrTy = Query.Types[1];
|
|
unsigned AS = PtrTy.getAddressSpace();
|
|
if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
|
|
Query.MMODescrs[0].Ordering !=
|
|
AtomicOrdering::NotAtomic))
|
|
return true;
|
|
|
|
// Catch weird sized loads that don't evenly divide into the access sizes
|
|
// TODO: May be able to widen depending on alignment etc.
|
|
unsigned NumRegs = (MemSize + 31) / 32;
|
|
if (NumRegs == 3) {
|
|
if (!ST.hasDwordx3LoadStores())
|
|
return true;
|
|
} else {
|
|
// If the alignment allows, these should have been widened.
|
|
if (!isPowerOf2_32(NumRegs))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
|
|
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
|
|
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
|
|
|
|
// TODO: Refine based on subtargets which support unaligned access or 128-bit
|
|
// LDS
|
|
// TODO: Unsupported flat for SI.
|
|
|
|
for (unsigned Op : {G_LOAD, G_STORE}) {
|
|
const bool IsStore = Op == G_STORE;
|
|
|
|
auto &Actions = getActionDefinitionsBuilder(Op);
|
|
// Explicitly list some common cases.
|
|
// TODO: Does this help compile time at all?
|
|
Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
|
|
{V2S32, GlobalPtr, V2S32, GlobalAlign32},
|
|
{V4S32, GlobalPtr, V4S32, GlobalAlign32},
|
|
{S64, GlobalPtr, S64, GlobalAlign32},
|
|
{V2S64, GlobalPtr, V2S64, GlobalAlign32},
|
|
{V2S16, GlobalPtr, V2S16, GlobalAlign32},
|
|
{S32, GlobalPtr, S8, GlobalAlign8},
|
|
{S32, GlobalPtr, S16, GlobalAlign16},
|
|
|
|
{S32, LocalPtr, S32, 32},
|
|
{S64, LocalPtr, S64, 32},
|
|
{V2S32, LocalPtr, V2S32, 32},
|
|
{S32, LocalPtr, S8, 8},
|
|
{S32, LocalPtr, S16, 16},
|
|
{V2S16, LocalPtr, S32, 32},
|
|
|
|
{S32, PrivatePtr, S32, 32},
|
|
{S32, PrivatePtr, S8, 8},
|
|
{S32, PrivatePtr, S16, 16},
|
|
{V2S16, PrivatePtr, S32, 32},
|
|
|
|
{S32, ConstantPtr, S32, GlobalAlign32},
|
|
{V2S32, ConstantPtr, V2S32, GlobalAlign32},
|
|
{V4S32, ConstantPtr, V4S32, GlobalAlign32},
|
|
{S64, ConstantPtr, S64, GlobalAlign32},
|
|
{V2S32, ConstantPtr, V2S32, GlobalAlign32}});
|
|
Actions.legalIf(
|
|
[=](const LegalityQuery &Query) -> bool {
|
|
return isLoadStoreLegal(ST, Query);
|
|
});
|
|
|
|
// The custom pointers (fat pointers, buffer resources) don't work with load
|
|
// and store at this level. Fat pointers should have been lowered to
|
|
// intrinsics before the translation to MIR.
|
|
Actions.unsupportedIf(
|
|
typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
|
|
|
|
// Address space 8 pointers are handled by a 4xs32 load, bitcast, and
|
|
// ptrtoint. This is needed to account for the fact that we can't have i128
|
|
// as a register class for SelectionDAG reasons.
|
|
Actions.customIf([=](const LegalityQuery &Query) -> bool {
|
|
return hasBufferRsrcWorkaround(Query.Types[0]);
|
|
});
|
|
|
|
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
|
|
// 64-bits.
|
|
//
|
|
// TODO: Should generalize bitcast action into coerce, which will also cover
|
|
// inserting addrspacecasts.
|
|
Actions.customIf(typeIs(1, Constant32Ptr));
|
|
|
|
// Turn any illegal element vectors into something easier to deal
|
|
// with. These will ultimately produce 32-bit scalar shifts to extract the
|
|
// parts anyway.
|
|
//
|
|
// For odd 16-bit element vectors, prefer to split those into pieces with
|
|
// 16-bit vector parts.
|
|
Actions.bitcastIf(
|
|
[=](const LegalityQuery &Query) -> bool {
|
|
return shouldBitcastLoadStoreType(ST, Query.Types[0],
|
|
Query.MMODescrs[0].MemoryTy);
|
|
}, bitcastToRegisterType(0));
|
|
|
|
if (!IsStore) {
|
|
// Widen suitably aligned loads by loading extra bytes. The standard
|
|
// legalization actions can't properly express widening memory operands.
|
|
Actions.customIf([=](const LegalityQuery &Query) -> bool {
|
|
return shouldWidenLoad(ST, Query, G_LOAD);
|
|
});
|
|
}
|
|
|
|
// FIXME: load/store narrowing should be moved to lower action
|
|
Actions
|
|
.narrowScalarIf(
|
|
[=](const LegalityQuery &Query) -> bool {
|
|
return !Query.Types[0].isVector() &&
|
|
needToSplitMemOp(Query, Op == G_LOAD);
|
|
},
|
|
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
|
|
const LLT DstTy = Query.Types[0];
|
|
const LLT PtrTy = Query.Types[1];
|
|
|
|
const unsigned DstSize = DstTy.getSizeInBits();
|
|
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
|
|
|
|
// Split extloads.
|
|
if (DstSize > MemSize)
|
|
return std::pair(0, LLT::scalar(MemSize));
|
|
|
|
unsigned MaxSize = maxSizeForAddrSpace(
|
|
ST, PtrTy.getAddressSpace(), Op == G_LOAD,
|
|
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
|
|
if (MemSize > MaxSize)
|
|
return std::pair(0, LLT::scalar(MaxSize));
|
|
|
|
uint64_t Align = Query.MMODescrs[0].AlignInBits;
|
|
return std::pair(0, LLT::scalar(Align));
|
|
})
|
|
.fewerElementsIf(
|
|
[=](const LegalityQuery &Query) -> bool {
|
|
return Query.Types[0].isVector() &&
|
|
needToSplitMemOp(Query, Op == G_LOAD);
|
|
},
|
|
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
|
|
const LLT DstTy = Query.Types[0];
|
|
const LLT PtrTy = Query.Types[1];
|
|
|
|
LLT EltTy = DstTy.getElementType();
|
|
unsigned MaxSize = maxSizeForAddrSpace(
|
|
ST, PtrTy.getAddressSpace(), Op == G_LOAD,
|
|
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
|
|
|
|
// FIXME: Handle widened to power of 2 results better. This ends
|
|
// up scalarizing.
|
|
// FIXME: 3 element stores scalarized on SI
|
|
|
|
// Split if it's too large for the address space.
|
|
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
|
|
if (MemSize > MaxSize) {
|
|
unsigned NumElts = DstTy.getNumElements();
|
|
unsigned EltSize = EltTy.getSizeInBits();
|
|
|
|
if (MaxSize % EltSize == 0) {
|
|
return std::pair(
|
|
0, LLT::scalarOrVector(
|
|
ElementCount::getFixed(MaxSize / EltSize), EltTy));
|
|
}
|
|
|
|
unsigned NumPieces = MemSize / MaxSize;
|
|
|
|
// FIXME: Refine when odd breakdowns handled
|
|
// The scalars will need to be re-legalized.
|
|
if (NumPieces == 1 || NumPieces >= NumElts ||
|
|
NumElts % NumPieces != 0)
|
|
return std::pair(0, EltTy);
|
|
|
|
return std::pair(0,
|
|
LLT::fixed_vector(NumElts / NumPieces, EltTy));
|
|
}
|
|
|
|
// FIXME: We could probably handle weird extending loads better.
|
|
if (DstTy.getSizeInBits() > MemSize)
|
|
return std::pair(0, EltTy);
|
|
|
|
unsigned EltSize = EltTy.getSizeInBits();
|
|
unsigned DstSize = DstTy.getSizeInBits();
|
|
if (!isPowerOf2_32(DstSize)) {
|
|
// We're probably decomposing an odd sized store. Try to split
|
|
// to the widest type. TODO: Account for alignment. As-is it
|
|
// should be OK, since the new parts will be further legalized.
|
|
unsigned FloorSize = llvm::bit_floor(DstSize);
|
|
return std::pair(
|
|
0, LLT::scalarOrVector(
|
|
ElementCount::getFixed(FloorSize / EltSize), EltTy));
|
|
}
|
|
|
|
// May need relegalization for the scalars.
|
|
return std::pair(0, EltTy);
|
|
})
|
|
.minScalar(0, S32)
|
|
.narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
|
|
.widenScalarToNextPow2(0)
|
|
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
|
|
.lower();
|
|
}
|
|
|
|
// FIXME: Unaligned accesses not lowered.
|
|
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
|
|
.legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
|
|
{S32, GlobalPtr, S16, 2 * 8},
|
|
{S32, LocalPtr, S8, 8},
|
|
{S32, LocalPtr, S16, 16},
|
|
{S32, PrivatePtr, S8, 8},
|
|
{S32, PrivatePtr, S16, 16},
|
|
{S32, ConstantPtr, S8, 8},
|
|
{S32, ConstantPtr, S16, 2 * 8}})
|
|
.legalIf(
|
|
[=](const LegalityQuery &Query) -> bool {
|
|
return isLoadStoreLegal(ST, Query);
|
|
});
|
|
|
|
if (ST.hasFlatAddressSpace()) {
|
|
ExtLoads.legalForTypesWithMemDesc(
|
|
{{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
|
|
}
|
|
|
|
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
|
|
// 64-bits.
|
|
//
|
|
// TODO: Should generalize bitcast action into coerce, which will also cover
|
|
// inserting addrspacecasts.
|
|
ExtLoads.customIf(typeIs(1, Constant32Ptr));
|
|
|
|
ExtLoads.clampScalar(0, S32, S32)
|
|
.widenScalarToNextPow2(0)
|
|
.lower();
|
|
|
|
auto &Atomics = getActionDefinitionsBuilder(
|
|
{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
|
|
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
|
|
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
|
|
G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
|
|
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
|
|
{S64, GlobalPtr}, {S64, LocalPtr},
|
|
{S32, RegionPtr}, {S64, RegionPtr}});
|
|
if (ST.hasFlatAddressSpace()) {
|
|
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
|
|
}
|
|
|
|
// TODO: v2bf16 operations, and fat buffer pointer support.
|
|
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
|
|
if (ST.hasLDSFPAtomicAddF32()) {
|
|
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
|
|
if (ST.hasLdsAtomicAddF64())
|
|
Atomic.legalFor({{S64, LocalPtr}});
|
|
if (ST.hasAtomicDsPkAdd16Insts())
|
|
Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
|
|
}
|
|
if (ST.hasAtomicFaddInsts())
|
|
Atomic.legalFor({{S32, GlobalPtr}});
|
|
if (ST.hasFlatAtomicFaddF32Inst())
|
|
Atomic.legalFor({{S32, FlatPtr}});
|
|
|
|
if (ST.hasGFX90AInsts()) {
|
|
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations.
|
|
// TODO: Move atomic expansion into legalizer
|
|
Atomic.legalFor({
|
|
{S32, GlobalPtr},
|
|
{S64, GlobalPtr},
|
|
{S64, FlatPtr}
|
|
});
|
|
}
|
|
|
|
if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
|
|
ST.hasAtomicBufferGlobalPkAddF16Insts())
|
|
Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
|
|
if (ST.hasAtomicGlobalPkAddBF16Inst())
|
|
Atomic.legalFor({{V2BF16, GlobalPtr}});
|
|
if (ST.hasAtomicFlatPkAdd16Insts())
|
|
Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
|
|
|
|
|
|
// Most of the legalization work here is done by AtomicExpand. We could
|
|
// probably use a simpler legality rule that just assumes anything is OK.
|
|
auto &AtomicFMinFMax =
|
|
getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
|
|
.legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
|
|
|
|
if (ST.hasAtomicFMinFMaxF32GlobalInsts())
|
|
AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
|
|
if (ST.hasAtomicFMinFMaxF64GlobalInsts())
|
|
AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
|
|
if (ST.hasAtomicFMinFMaxF32FlatInsts())
|
|
AtomicFMinFMax.legalFor({F32, FlatPtr});
|
|
if (ST.hasAtomicFMinFMaxF64FlatInsts())
|
|
AtomicFMinFMax.legalFor({F64, FlatPtr});
|
|
|
|
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
|
|
// demarshalling
|
|
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
|
|
.customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
|
|
{S32, FlatPtr}, {S64, FlatPtr}})
|
|
.legalFor({{S32, LocalPtr}, {S64, LocalPtr},
|
|
{S32, RegionPtr}, {S64, RegionPtr}});
|
|
// TODO: Pointer types, any 32-bit or 64-bit vector
|
|
|
|
// Condition should be s32 for scalar, s1 for vector.
|
|
getActionDefinitionsBuilder(G_SELECT)
|
|
.legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
|
|
LocalPtr, FlatPtr, PrivatePtr,
|
|
LLT::fixed_vector(2, LocalPtr),
|
|
LLT::fixed_vector(2, PrivatePtr)},
|
|
{S1, S32})
|
|
.clampScalar(0, S16, S64)
|
|
.scalarize(1)
|
|
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
|
|
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
|
|
.clampMaxNumElements(0, S32, 2)
|
|
.clampMaxNumElements(0, LocalPtr, 2)
|
|
.clampMaxNumElements(0, PrivatePtr, 2)
|
|
.scalarize(0)
|
|
.widenScalarToNextPow2(0)
|
|
.legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
|
|
|
|
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
|
|
// be more flexible with the shift amount type.
|
|
auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
|
|
.legalFor({{S32, S32}, {S64, S32}});
|
|
if (ST.has16BitInsts()) {
|
|
if (ST.hasVOP3PInsts()) {
|
|
Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
|
|
.clampMaxNumElements(0, S16, 2);
|
|
} else
|
|
Shifts.legalFor({{S16, S16}});
|
|
|
|
// TODO: Support 16-bit shift amounts for all types
|
|
Shifts.widenScalarIf(
|
|
[=](const LegalityQuery &Query) {
|
|
// Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
|
|
// 32-bit amount.
|
|
const LLT ValTy = Query.Types[0];
|
|
const LLT AmountTy = Query.Types[1];
|
|
return ValTy.getSizeInBits() <= 16 &&
|
|
AmountTy.getSizeInBits() < 16;
|
|
}, changeTo(1, S16));
|
|
Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
|
|
Shifts.clampScalar(1, S32, S32);
|
|
Shifts.widenScalarToNextPow2(0, 16);
|
|
Shifts.clampScalar(0, S16, S64);
|
|
|
|
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
|
|
.minScalar(0, S16)
|
|
.scalarize(0)
|
|
.lower();
|
|
} else {
|
|
// Make sure we legalize the shift amount type first, as the general
|
|
// expansion for the shifted type will produce much worse code if it hasn't
|
|
// been truncated already.
|
|
Shifts.clampScalar(1, S32, S32);
|
|
Shifts.widenScalarToNextPow2(0, 32);
|
|
Shifts.clampScalar(0, S32, S64);
|
|
|
|
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
|
|
.minScalar(0, S32)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
Shifts.scalarize(0);
|
|
|
|
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
|
|
unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
|
|
unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
|
|
unsigned IdxTypeIdx = 2;
|
|
|
|
getActionDefinitionsBuilder(Op)
|
|
.customIf([=](const LegalityQuery &Query) {
|
|
const LLT EltTy = Query.Types[EltTypeIdx];
|
|
const LLT VecTy = Query.Types[VecTypeIdx];
|
|
const LLT IdxTy = Query.Types[IdxTypeIdx];
|
|
const unsigned EltSize = EltTy.getSizeInBits();
|
|
const bool isLegalVecType =
|
|
!!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
|
|
// Address space 8 pointers are 128-bit wide values, but the logic
// below will try to bitcast them to 2N x s64, which will fail.
// Therefore, as an intermediate step, handle extracts/insertions by
// ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
// extraction result) in order to produce a vector operation that can
// be handled by the logic below.
|
|
if (EltTy.isPointer() && EltSize > 64)
|
|
return true;
|
|
return (EltSize == 32 || EltSize == 64) &&
|
|
VecTy.getSizeInBits() % 32 == 0 &&
|
|
VecTy.getSizeInBits() <= MaxRegisterSize &&
|
|
IdxTy.getSizeInBits() == 32 &&
|
|
isLegalVecType;
|
|
})
|
|
.bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
|
|
bitcastToVectorElement32(VecTypeIdx))
|
|
//.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
|
|
.bitcastIf(
|
|
all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
|
|
[=](const LegalityQuery &Query) {
|
|
// For > 64-bit element types, try to turn this into a 64-bit
|
|
// element vector since we may be able to do better indexing
|
|
// if this is scalar. If not, fall back to 32.
|
|
const LLT EltTy = Query.Types[EltTypeIdx];
|
|
const LLT VecTy = Query.Types[VecTypeIdx];
|
|
const unsigned DstEltSize = EltTy.getSizeInBits();
|
|
const unsigned VecSize = VecTy.getSizeInBits();
|
|
|
|
const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
|
|
return std::pair(
|
|
VecTypeIdx,
|
|
LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
|
|
})
|
|
.clampScalar(EltTypeIdx, S32, S64)
|
|
.clampScalar(VecTypeIdx, S32, S64)
|
|
.clampScalar(IdxTypeIdx, S32, S32)
|
|
.clampMaxNumElements(VecTypeIdx, S32, 32)
|
|
// TODO: Clamp elements for 64-bit vectors?
|
|
.moreElementsIf(
|
|
isIllegalRegisterType(VecTypeIdx),
|
|
moreElementsToNextExistingRegClass(VecTypeIdx))
|
|
// It should only be necessary with variable indexes.
// As a last resort, lower to the stack.
|
|
.lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
|
|
.unsupportedIf([=](const LegalityQuery &Query) {
|
|
const LLT &EltTy = Query.Types[1].getElementType();
|
|
return Query.Types[0] != EltTy;
|
|
});
|
|
|
|
for (unsigned Op : {G_EXTRACT, G_INSERT}) {
|
|
unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
|
|
unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
|
|
|
|
// FIXME: Doesn't handle extract of illegal sizes.
|
|
getActionDefinitionsBuilder(Op)
|
|
.lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
|
|
.lowerIf([=](const LegalityQuery &Query) {
|
|
// Sub-vector (or single element) insert and extract.
|
|
// TODO: verify immediate offset here since lower only works with
|
|
// whole elements.
|
|
const LLT BigTy = Query.Types[BigTyIdx];
|
|
return BigTy.isVector();
|
|
})
|
|
// FIXME: Multiples of 16 should not be legal.
|
|
.legalIf([=](const LegalityQuery &Query) {
|
|
const LLT BigTy = Query.Types[BigTyIdx];
|
|
const LLT LitTy = Query.Types[LitTyIdx];
|
|
return (BigTy.getSizeInBits() % 32 == 0) &&
|
|
(LitTy.getSizeInBits() % 16 == 0);
|
|
})
|
|
.widenScalarIf(
|
|
[=](const LegalityQuery &Query) {
|
|
const LLT BigTy = Query.Types[BigTyIdx];
|
|
return (BigTy.getScalarSizeInBits() < 16);
|
|
},
|
|
LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
|
|
.widenScalarIf(
|
|
[=](const LegalityQuery &Query) {
|
|
const LLT LitTy = Query.Types[LitTyIdx];
|
|
return (LitTy.getScalarSizeInBits() < 16);
|
|
},
|
|
LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
|
|
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
|
|
.widenScalarToNextPow2(BigTyIdx, 32);
|
|
|
|
}
|
|
|
|
auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
|
|
.legalForCartesianProduct(AllS32Vectors, {S32})
|
|
.legalForCartesianProduct(AllS64Vectors, {S64})
|
|
.clampNumElements(0, V16S32, V32S32)
|
|
.clampNumElements(0, V2S64, V16S64)
|
|
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
|
|
.moreElementsIf(
|
|
isIllegalRegisterType(0),
|
|
moreElementsToNextExistingRegClass(0));
|
|
|
|
if (ST.hasScalarPackInsts()) {
|
|
BuildVector
|
|
// FIXME: Should probably widen s1 vectors straight to s32
|
|
.minScalarOrElt(0, S16)
|
|
.minScalar(1, S16);
|
|
|
|
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
|
|
.legalFor({V2S16, S32})
|
|
.lower();
|
|
} else {
|
|
BuildVector.customFor({V2S16, S16});
|
|
BuildVector.minScalarOrElt(0, S32);
|
|
|
|
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
|
|
.customFor({V2S16, S32})
|
|
.lower();
|
|
}
|
|
|
|
BuildVector.legalIf(isRegisterType(0));
|
|
|
|
// FIXME: Clamp maximum size
|
|
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
|
|
.legalIf(all(isRegisterType(0), isRegisterType(1)))
|
|
.clampMaxNumElements(0, S32, 32)
|
|
.clampMaxNumElements(1, S16, 2) // TODO: Make 4?
|
|
.clampMaxNumElements(0, S16, 64);
|
|
|
|
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
|
|
|
|
// Merge/Unmerge
|
|
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
|
|
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
|
|
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
|
|
|
|
auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
|
|
const LLT Ty = Query.Types[TypeIdx];
|
|
if (Ty.isVector()) {
|
|
const LLT &EltTy = Ty.getElementType();
|
|
if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
|
|
return true;
|
|
if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
auto &Builder = getActionDefinitionsBuilder(Op)
|
|
.legalIf(all(isRegisterType(0), isRegisterType(1)))
|
|
.lowerFor({{S16, V2S16}})
|
|
.lowerIf([=](const LegalityQuery &Query) {
|
|
const LLT BigTy = Query.Types[BigTyIdx];
|
|
return BigTy.getSizeInBits() == 32;
|
|
})
|
|
// Try to widen to s16 first for small types.
|
|
// TODO: Only do this on targets with legal s16 shifts
|
|
.minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
|
|
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
|
|
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
|
|
.fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
|
|
elementTypeIs(1, S16)),
|
|
changeTo(1, V2S16))
|
|
// Clamp the little scalar to s8-s256 and make it a power of 2. It's not
|
|
// worth considering the multiples of 64 since 2*192 and 2*384 are not
|
|
// valid.
|
|
.clampScalar(LitTyIdx, S32, S512)
|
|
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
|
|
// Break up vectors with weird elements into scalars
|
|
.fewerElementsIf(
|
|
[=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
|
|
scalarize(0))
|
|
.fewerElementsIf(
|
|
[=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
|
|
scalarize(1))
|
|
.clampScalar(BigTyIdx, S32, MaxScalar);
|
|
|
|
if (Op == G_MERGE_VALUES) {
|
|
Builder.widenScalarIf(
|
|
// TODO: Use 16-bit shifts if legal for 8-bit values?
|
|
[=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[LitTyIdx];
|
|
return Ty.getSizeInBits() < 32;
|
|
},
|
|
changeTo(LitTyIdx, S32));
|
|
}
|
|
|
|
Builder.widenScalarIf(
|
|
[=](const LegalityQuery &Query) {
|
|
const LLT Ty = Query.Types[BigTyIdx];
|
|
return Ty.getSizeInBits() % 16 != 0;
|
|
},
|
|
[=](const LegalityQuery &Query) {
|
|
// Pick the next power of 2, or a multiple of 64 over 128,
// whichever is smaller.
|
|
const LLT &Ty = Query.Types[BigTyIdx];
|
|
unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
|
|
if (NewSizeInBits >= 256) {
|
|
unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
|
|
if (RoundedTo < NewSizeInBits)
|
|
NewSizeInBits = RoundedTo;
|
|
}
|
|
return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
|
|
})
|
|
// Any vectors left are the wrong size. Scalarize them.
|
|
.scalarize(0)
|
|
.scalarize(1);
|
|
}
|
|
|
|
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
|
|
// RegBankSelect.
|
|
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
|
|
.legalFor({{S32}, {S64}});
|
|
|
|
if (ST.hasVOP3PInsts()) {
|
|
SextInReg.lowerFor({{V2S16}})
|
|
// Prefer to reduce vector widths for 16-bit vectors before lowering, to
|
|
// get more vector shift opportunities, since we'll get those when
|
|
// expanded.
|
|
.clampMaxNumElementsStrict(0, S16, 2);
|
|
} else if (ST.has16BitInsts()) {
|
|
SextInReg.lowerFor({{S32}, {S64}, {S16}});
|
|
} else {
|
|
// Prefer to promote to s32 before lowering if we don't have 16-bit
|
|
// shifts. This avoid a lot of intermediate truncate and extend operations.
|
|
SextInReg.lowerFor({{S32}, {S64}});
|
|
}
|
|
|
|
SextInReg
|
|
.scalarize(0)
|
|
.clampScalar(0, S32, S64)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_ROTR, G_ROTL})
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
// TODO: Only try to form v2s16 with legal packed instructions.
|
|
getActionDefinitionsBuilder(G_FSHR)
|
|
.legalFor({{S32, S32}})
|
|
.lowerFor({{V2S16, V2S16}})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.lower();
|
|
|
|
if (ST.hasVOP3PInsts()) {
|
|
getActionDefinitionsBuilder(G_FSHL)
|
|
.lowerFor({{V2S16, V2S16}})
|
|
.clampMaxNumElementsStrict(0, S16, 2)
|
|
.scalarize(0)
|
|
.lower();
|
|
} else {
|
|
getActionDefinitionsBuilder(G_FSHL)
|
|
.scalarize(0)
|
|
.lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
|
|
.legalFor({S64});
|
|
|
|
getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
|
|
|
|
getActionDefinitionsBuilder(G_FENCE)
|
|
.alwaysLegal();
|
|
|
|
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
|
|
.scalarize(0)
|
|
.minScalar(0, S32)
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
|
|
.legalFor({{S32, S32}, {S64, S32}})
|
|
.clampScalar(1, S32, S32)
|
|
.clampScalar(0, S32, S64)
|
|
.widenScalarToNextPow2(0)
|
|
.scalarize(0);
|
|
|
|
getActionDefinitionsBuilder(
|
|
{// TODO: Verify V_BFI_B32 is generated from expanded bit ops
|
|
G_FCOPYSIGN,
|
|
|
|
G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
|
|
G_READ_REGISTER, G_WRITE_REGISTER,
|
|
|
|
G_SADDO, G_SSUBO})
|
|
.lower();
|
|
|
|
if (ST.hasIEEEMinMax()) {
|
|
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
|
|
.legalFor(FPTypesPK16)
|
|
.clampMaxNumElements(0, S16, 2)
|
|
.scalarize(0);
|
|
} else {
|
|
// TODO: Implement
|
|
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
|
|
}
|
|
|
|
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
|
|
.lower();
|
|
|
|
getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
|
|
|
|
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
|
|
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
|
|
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
|
|
.unsupported();
|
|
|
|
getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
|
|
|
|
getLegacyLegalizerInfo().computeTables();
|
|
verify(*ST.getInstrInfo());
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeCustom(
|
|
LegalizerHelper &Helper, MachineInstr &MI,
|
|
LostDebugLocObserver &LocObserver) const {
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
switch (MI.getOpcode()) {
|
|
case TargetOpcode::G_ADDRSPACE_CAST:
|
|
return legalizeAddrSpaceCast(MI, MRI, B);
|
|
case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
|
|
return legalizeFroundeven(MI, MRI, B);
|
|
case TargetOpcode::G_FCEIL:
|
|
return legalizeFceil(MI, MRI, B);
|
|
case TargetOpcode::G_FREM:
|
|
return legalizeFrem(MI, MRI, B);
|
|
case TargetOpcode::G_INTRINSIC_TRUNC:
|
|
return legalizeIntrinsicTrunc(MI, MRI, B);
|
|
case TargetOpcode::G_SITOFP:
|
|
return legalizeITOFP(MI, MRI, B, true);
|
|
case TargetOpcode::G_UITOFP:
|
|
return legalizeITOFP(MI, MRI, B, false);
|
|
case TargetOpcode::G_FPTOSI:
|
|
return legalizeFPTOI(MI, MRI, B, true);
|
|
case TargetOpcode::G_FPTOUI:
|
|
return legalizeFPTOI(MI, MRI, B, false);
|
|
case TargetOpcode::G_FMINNUM:
|
|
case TargetOpcode::G_FMAXNUM:
|
|
case TargetOpcode::G_FMINNUM_IEEE:
|
|
case TargetOpcode::G_FMAXNUM_IEEE:
|
|
return legalizeMinNumMaxNum(Helper, MI);
|
|
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
|
|
return legalizeExtractVectorElt(MI, MRI, B);
|
|
case TargetOpcode::G_INSERT_VECTOR_ELT:
|
|
return legalizeInsertVectorElt(MI, MRI, B);
|
|
case TargetOpcode::G_FSIN:
|
|
case TargetOpcode::G_FCOS:
|
|
return legalizeSinCos(MI, MRI, B);
|
|
case TargetOpcode::G_GLOBAL_VALUE:
|
|
return legalizeGlobalValue(MI, MRI, B);
|
|
case TargetOpcode::G_LOAD:
|
|
case TargetOpcode::G_SEXTLOAD:
|
|
case TargetOpcode::G_ZEXTLOAD:
|
|
return legalizeLoad(Helper, MI);
|
|
case TargetOpcode::G_STORE:
|
|
return legalizeStore(Helper, MI);
|
|
case TargetOpcode::G_FMAD:
|
|
return legalizeFMad(MI, MRI, B);
|
|
case TargetOpcode::G_FDIV:
|
|
return legalizeFDIV(MI, MRI, B);
|
|
case TargetOpcode::G_FFREXP:
|
|
return legalizeFFREXP(MI, MRI, B);
|
|
case TargetOpcode::G_FSQRT:
|
|
return legalizeFSQRT(MI, MRI, B);
|
|
case TargetOpcode::G_UDIV:
|
|
case TargetOpcode::G_UREM:
|
|
case TargetOpcode::G_UDIVREM:
|
|
return legalizeUnsignedDIV_REM(MI, MRI, B);
|
|
case TargetOpcode::G_SDIV:
|
|
case TargetOpcode::G_SREM:
|
|
case TargetOpcode::G_SDIVREM:
|
|
return legalizeSignedDIV_REM(MI, MRI, B);
|
|
case TargetOpcode::G_ATOMIC_CMPXCHG:
|
|
return legalizeAtomicCmpXChg(MI, MRI, B);
|
|
case TargetOpcode::G_FLOG2:
|
|
return legalizeFlog2(MI, B);
|
|
case TargetOpcode::G_FLOG:
|
|
case TargetOpcode::G_FLOG10:
|
|
return legalizeFlogCommon(MI, B);
|
|
case TargetOpcode::G_FEXP2:
|
|
return legalizeFExp2(MI, B);
|
|
case TargetOpcode::G_FEXP:
|
|
case TargetOpcode::G_FEXP10:
|
|
return legalizeFExp(MI, B);
|
|
case TargetOpcode::G_FPOW:
|
|
return legalizeFPow(MI, B);
|
|
case TargetOpcode::G_FFLOOR:
|
|
return legalizeFFloor(MI, MRI, B);
|
|
case TargetOpcode::G_BUILD_VECTOR:
|
|
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
|
|
return legalizeBuildVector(MI, MRI, B);
|
|
case TargetOpcode::G_MUL:
|
|
return legalizeMul(Helper, MI);
|
|
case TargetOpcode::G_CTLZ:
|
|
case TargetOpcode::G_CTTZ:
|
|
return legalizeCTLZ_CTTZ(MI, MRI, B);
|
|
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
|
|
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
|
|
case TargetOpcode::G_STACKSAVE:
|
|
return legalizeStackSave(MI, B);
|
|
case TargetOpcode::G_GET_FPENV:
|
|
return legalizeGetFPEnv(MI, MRI, B);
|
|
case TargetOpcode::G_SET_FPENV:
|
|
return legalizeSetFPEnv(MI, MRI, B);
|
|
case TargetOpcode::G_TRAP:
|
|
return legalizeTrap(MI, MRI, B);
|
|
case TargetOpcode::G_DEBUGTRAP:
|
|
return legalizeDebugTrap(MI, MRI, B);
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
llvm_unreachable("expected switch to return");
|
|
}
|
|
|
|
Register AMDGPULegalizerInfo::getSegmentAperture(
|
|
unsigned AS,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
MachineFunction &MF = B.getMF();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
|
|
|
|
if (ST.hasApertureRegs()) {
// Note: this register is somewhat broken. When used as a 32-bit operand,
// it only returns zeroes. The real value is in the upper 32 bits.
// Thus, we must extract the high 32 bits.
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
|
|
? AMDGPU::SRC_SHARED_BASE
|
|
: AMDGPU::SRC_PRIVATE_BASE;
|
|
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits), which is an artificial
// (unusable) register.
|
|
// Register TableGen definitions would need an overhaul to get rid of the
|
|
// artificial "HI" aperture registers and prevent this kind of issue from
|
|
// happening.
|
|
Register Dst = MRI.createGenericVirtualRegister(S64);
|
|
MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
|
|
B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
|
|
return B.buildUnmerge(S32, Dst).getReg(1);
|
|
}
|
|
|
|
// TODO: can we be smarter about machine pointer info?
|
|
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
|
Register LoadAddr = MRI.createGenericVirtualRegister(
|
|
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
|
|
// For code object version 5, private_base and shared_base are passed through
|
|
// implicit kernargs.
|
|
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
|
|
AMDGPU::AMDHSA_COV5) {
|
|
AMDGPUTargetLowering::ImplicitParameter Param =
|
|
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
|
|
: AMDGPUTargetLowering::PRIVATE_BASE;
|
|
uint64_t Offset =
|
|
ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
|
|
|
|
Register KernargPtrReg = MRI.createGenericVirtualRegister(
|
|
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
|
|
|
|
if (!loadInputValue(KernargPtrReg, B,
|
|
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
|
|
return Register();
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
PtrInfo,
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
MachineMemOperand::MOInvariant,
|
|
LLT::scalar(32), commonAlignment(Align(64), Offset));
|
|
|
|
// Pointer address
|
|
B.buildPtrAdd(LoadAddr, KernargPtrReg,
|
|
B.buildConstant(LLT::scalar(64), Offset).getReg(0));
|
|
// Load address
|
|
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
|
|
}
|
|
|
|
Register QueuePtr = MRI.createGenericVirtualRegister(
|
|
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
|
|
|
|
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
|
|
return Register();
|
|
|
|
// Offset into amd_queue_t for group_segment_aperture_base_hi /
|
|
// private_segment_aperture_base_hi.
|
|
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
|
|
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
PtrInfo,
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
MachineMemOperand::MOInvariant,
|
|
LLT::scalar(32), commonAlignment(Align(64), StructOffset));
|
|
|
|
B.buildPtrAdd(LoadAddr, QueuePtr,
|
|
B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
|
|
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
|
|
}
|
|
|
|
/// Return true if the value is a known valid address, such that a null check is
|
|
/// not necessary.
|
|
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
|
|
const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
|
|
MachineInstr *Def = MRI.getVRegDef(Val);
|
|
switch (Def->getOpcode()) {
|
|
case AMDGPU::G_FRAME_INDEX:
|
|
case AMDGPU::G_GLOBAL_VALUE:
|
|
case AMDGPU::G_BLOCK_ADDR:
|
|
return true;
|
|
case AMDGPU::G_CONSTANT: {
|
|
const ConstantInt *CI = Def->getOperand(1).getCImm();
|
|
return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
// MI can either be a G_ADDRSPACE_CAST or a
|
|
// G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
|
|
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
|
|
(isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
|
|
Intrinsic::amdgcn_addrspacecast_nonnull));
|
|
|
|
const LLT S32 = LLT::scalar(32);
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
|
|
: MI.getOperand(1).getReg();
|
|
LLT DstTy = MRI.getType(Dst);
|
|
LLT SrcTy = MRI.getType(Src);
|
|
unsigned DestAS = DstTy.getAddressSpace();
|
|
unsigned SrcAS = SrcTy.getAddressSpace();
|
|
|
|
// TODO: Avoid reloading from the queue ptr for each cast, or at least each
|
|
// vector element.
|
|
assert(!DstTy.isVector());
|
|
|
|
const AMDGPUTargetMachine &TM
|
|
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
|
|
|
|
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
|
|
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
|
|
return true;
|
|
}
|
|
|
|
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
|
|
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
|
|
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
|
|
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
|
|
// G_ADDRSPACE_CAST we need to guess.
|
|
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
|
|
// Extract low 32-bits of the pointer.
|
|
B.buildExtract(Dst, Src, 0);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
unsigned NullVal = TM.getNullPointerValue(DestAS);
|
|
|
|
auto SegmentNull = B.buildConstant(DstTy, NullVal);
|
|
auto FlatNull = B.buildConstant(SrcTy, 0);
|
|
|
|
// Extract low 32-bits of the pointer.
|
|
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
|
|
|
|
auto CmpRes =
|
|
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
|
|
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
|
|
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
|
|
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
|
|
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
|
|
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
|
|
if (!ApertureReg.isValid())
|
|
return false;
|
|
|
|
// Coerce the type of the low half of the result so we can use
|
|
// merge_values.
|
|
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
|
|
|
|
// TODO: Should we allow mismatched types but matching sizes in merges to
|
|
// avoid the ptrtoint?
|
|
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
|
|
};
|
|
|
|
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
|
|
// G_ADDRSPACE_CAST we need to guess.
|
|
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
|
|
castLocalOrPrivateToFlat(Dst);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
|
|
|
|
auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
|
|
auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
|
|
|
|
auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
|
|
SegmentNull.getReg(0));
|
|
|
|
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
|
|
SrcTy.getSizeInBits() == 64) {
|
|
// Truncate.
|
|
B.buildExtract(Dst, Src, 0);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
|
|
DstTy.getSizeInBits() == 64) {
|
|
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
|
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
|
|
auto PtrLo = B.buildPtrToInt(S32, Src);
|
|
auto HighAddr = B.buildConstant(S32, AddrHiVal);
|
|
B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
DiagnosticInfoUnsupported InvalidAddrSpaceCast(
|
|
MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
|
|
|
|
LLVMContext &Ctx = MF.getFunction().getContext();
|
|
Ctx.diagnose(InvalidAddrSpaceCast);
|
|
B.buildUndef(Dst);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Src = MI.getOperand(1).getReg();
|
|
LLT Ty = MRI.getType(Src);
|
|
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
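// Round-to-nearest-even can be implemented by adding and then subtracting
// 2^52 with the sign of the input: the addition discards the fraction bits
// below the integer part under the default rounding mode. Inputs with
// |x| > 0x1.fffffffffffffp+51 are already integral and are returned
// unchanged via the final select.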
|
|
|
|
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
|
|
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
|
|
|
|
auto C1 = B.buildFConstant(Ty, C1Val);
|
|
auto CopySign = B.buildFCopysign(Ty, C1, Src);
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
|
|
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
|
|
|
|
auto C2 = B.buildFConstant(Ty, C2Val);
|
|
auto Fabs = B.buildFAbs(Ty, Src);
|
|
|
|
auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
|
|
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFceil(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
|
assert(MRI.getType(Src) == S64);
|
|
|
|
// result = trunc(src)
|
|
// if (src > 0.0 && src != result)
|
|
// result += 1.0
|
|
|
|
auto Trunc = B.buildIntrinsicTrunc(S64, Src);
|
|
|
|
const auto Zero = B.buildFConstant(S64, 0.0);
|
|
const auto One = B.buildFConstant(S64, 1.0);
|
|
auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
|
|
auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
|
|
auto And = B.buildAnd(S1, Lt0, NeTrunc);
|
|
auto Add = B.buildSelect(S64, And, One, Zero);
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFrem(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
Register Src0Reg = MI.getOperand(1).getReg();
|
|
Register Src1Reg = MI.getOperand(2).getReg();
|
|
auto Flags = MI.getFlags();
|
|
LLT Ty = MRI.getType(DstReg);
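// Expand as frem(x, y) = x - trunc(x / y) * y, with the final
// multiply-and-subtract done as a single fma.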
|
|
|
|
auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
|
|
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
|
|
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
|
|
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static MachineInstrBuilder extractF64Exponent(Register Hi,
|
|
MachineIRBuilder &B) {
|
|
const unsigned FractBits = 52;
|
|
const unsigned ExpBits = 11;
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
auto Const0 = B.buildConstant(S32, FractBits - 32);
|
|
auto Const1 = B.buildConstant(S32, ExpBits);
|
|
|
|
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
|
|
.addUse(Hi)
|
|
.addUse(Const0.getReg(0))
|
|
.addUse(Const1.getReg(0));
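// The extracted exponent is biased; subtract the IEEE-754 double exponent
// bias (1023) to get the unbiased exponent.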
|
|
|
|
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
Register Src = MI.getOperand(1).getReg();
|
|
assert(MRI.getType(Src) == S64);
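// Truncate by clearing the fraction bits that lie below the binary point:
// build a mask from the unbiased exponent and mask them off. Exponents below
// zero truncate to a signed zero; exponents above 51 mean the value is
// already an integer.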
|
|
|
|
// TODO: Should this use extract since the low half is unused?
|
|
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
|
|
Register Hi = Unmerge.getReg(1);
|
|
|
|
// Extract the upper half, since this is where we will find the sign and
|
|
// exponent.
|
|
auto Exp = extractF64Exponent(Hi, B);
|
|
|
|
const unsigned FractBits = 52;
|
|
|
|
// Extract the sign bit.
|
|
const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
|
|
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
|
|
|
|
const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
|
|
|
|
const auto Zero32 = B.buildConstant(S32, 0);
|
|
|
|
// Extend back to 64-bits.
|
|
auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
|
|
|
|
auto Shr = B.buildAShr(S64, FractMask, Exp);
|
|
auto Not = B.buildNot(S64, Shr);
|
|
auto Tmp0 = B.buildAnd(S64, Src, Not);
|
|
auto FiftyOne = B.buildConstant(S32, FractBits - 1);
|
|
|
|
auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
|
|
auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
|
|
|
|
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
|
|
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeITOFP(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool Signed) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
assert(MRI.getType(Src) == S64);
|
|
|
|
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
|
|
auto ThirtyTwo = B.buildConstant(S32, 32);
|
|
|
|
if (MRI.getType(Dst) == S64) {
|
|
auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
|
|
: B.buildUITOFP(S64, Unmerge.getReg(1));
|
|
|
|
auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
|
|
auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
|
|
|
|
// TODO: Should this propagate fast-math-flags?
|
|
B.buildFAdd(Dst, LdExp, CvtLo);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
assert(MRI.getType(Dst) == S32);
|
|
|
|
auto One = B.buildConstant(S32, 1);
|
|
|
|
MachineInstrBuilder ShAmt;
|
|
if (Signed) {
|
|
auto ThirtyOne = B.buildConstant(S32, 31);
|
|
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
|
|
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
|
|
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
|
|
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
|
|
.addUse(Unmerge.getReg(1));
|
|
auto LS2 = B.buildSub(S32, LS, One);
|
|
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
|
|
} else
|
|
ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
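// For a 32-bit result, normalize the 64-bit input: shift it left so the
// significant bits land in the high word, fold any remaining low bits into a
// sticky bit, convert that 32-bit word, and rescale with ldexp by
// (32 - shift).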
|
|
auto Norm = B.buildShl(S64, Src, ShAmt);
|
|
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
|
|
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
|
|
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
|
|
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
|
|
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
|
|
B.buildFLdexp(Dst, FVal, Scale);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// TODO: Copied from DAG implementation. Verify logic and document how this
|
|
// actually works.
|
|
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B,
|
|
bool Signed) const {
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
const LLT SrcLT = MRI.getType(Src);
|
|
assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
|
|
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
// The basic idea of converting a floating point number into a pair of 32-bit
|
|
// integers is illustrated as follows:
|
|
//
|
|
// tf := trunc(val);
|
|
// hif := floor(tf * 2^-32);
|
|
// lof := tf - hif * 2^32; // lof is always positive due to floor.
|
|
// hi := fptoi(hif);
|
|
// lo := fptoi(lof);
|
|
//
|
|
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
|
|
MachineInstrBuilder Sign;
|
|
if (Signed && SrcLT == S32) {
|
|
// However, a 32-bit floating point number has only a 23-bit mantissa, which
// is not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, we need to take the absolute
// value after truncating and flip the result back based on the original
// signedness.
|
|
Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
|
|
Trunc = B.buildFAbs(S32, Trunc, Flags);
|
|
}
|
|
MachineInstrBuilder K0, K1;
|
|
if (SrcLT == S64) {
|
|
K0 = B.buildFConstant(
|
|
S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
|
|
K1 = B.buildFConstant(
|
|
S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
|
|
} else {
|
|
K0 = B.buildFConstant(
|
|
S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
|
|
K1 = B.buildFConstant(
|
|
S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
|
|
}
|
|
|
|
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
|
|
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
|
|
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
|
|
|
|
auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
|
|
: B.buildFPTOUI(S32, FloorMul);
|
|
auto Lo = B.buildFPTOUI(S32, Fma);
|
|
|
|
if (Signed && SrcLT == S32) {
|
|
// Flip the result based on the signedness, which is either all 0s or 1s.
|
|
Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
|
|
// r := xor({lo, hi}, sign) - sign;
|
|
B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
|
|
Sign);
|
|
} else
|
|
B.buildMergeLikeInstr(Dst, {Lo, Hi});
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
|
|
MachineInstr &MI) const {
|
|
MachineFunction &MF = Helper.MIRBuilder.getMF();
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
|
|
MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
|
|
|
|
// With ieee_mode disabled, the instructions have the correct behavior
|
|
// already for G_FMINNUM/G_FMAXNUM
|
|
if (!MFI->getMode().IEEE)
|
|
return !IsIEEEOp;
|
|
|
|
if (IsIEEEOp)
|
|
return true;
|
|
|
|
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
|
|
}
|
|
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
// TODO: Should move some of this into LegalizerHelper.
|
|
|
|
// TODO: Promote dynamic indexing of s16 to s32
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Vec = MI.getOperand(1).getReg();
|
|
|
|
LLT VecTy = MRI.getType(Vec);
|
|
LLT EltTy = VecTy.getElementType();
|
|
assert(EltTy == MRI.getType(Dst));
|
|
|
|
// Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
// but we can't go directly to that logic because you can't bitcast a vector
// of pointers to a vector of integers. Therefore, introduce an intermediate
// vector of integers using ptrtoint (and inttoptr on the output) in order to
// drive the legalization forward.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
|
|
LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
|
|
LLT IntVecTy = VecTy.changeElementType(IntTy);
|
|
|
|
auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
|
|
auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
|
|
B.buildIntToPtr(Dst, IntElt);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// FIXME: Artifact combiner probably should have replaced the truncated
|
|
// constant before this, so we shouldn't need
|
|
// getIConstantVRegValWithLookThrough.
|
|
std::optional<ValueAndVReg> MaybeIdxVal =
|
|
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
|
|
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
|
|
return true;
|
|
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
|
|
|
|
if (IdxVal < VecTy.getNumElements()) {
|
|
auto Unmerge = B.buildUnmerge(EltTy, Vec);
|
|
B.buildCopy(Dst, Unmerge.getReg(IdxVal));
|
|
} else {
|
|
B.buildUndef(Dst);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
// TODO: Should move some of this into LegalizerHelper.
|
|
|
|
// TODO: Promote dynamic indexing of s16 to s32
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Vec = MI.getOperand(1).getReg();
|
|
Register Ins = MI.getOperand(2).getReg();
|
|
|
|
LLT VecTy = MRI.getType(Vec);
|
|
LLT EltTy = VecTy.getElementType();
|
|
assert(EltTy == MRI.getType(Ins));
|
|
|
|
// Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
// but we can't go directly to that logic because you can't bitcast a vector
// of pointers to a vector of integers. Therefore, make the pointer vector
// into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
// new value, and then inttoptr the result vector back. This will then allow
// the rest of legalization to take over.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
|
|
LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
|
|
LLT IntVecTy = VecTy.changeElementType(IntTy);
|
|
|
|
auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
|
|
auto IntIns = B.buildPtrToInt(IntTy, Ins);
|
|
auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
|
|
MI.getOperand(3));
|
|
B.buildIntToPtr(Dst, IntVecDest);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// FIXME: Artifact combiner probably should have replaced the truncated
|
|
// constant before this, so we shouldn't need
|
|
// getIConstantVRegValWithLookThrough.
|
|
std::optional<ValueAndVReg> MaybeIdxVal =
|
|
getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
|
|
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
|
|
return true;
|
|
|
|
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
|
|
|
|
unsigned NumElts = VecTy.getNumElements();
|
|
if (IdxVal < NumElts) {
|
|
SmallVector<Register, 8> SrcRegs;
|
|
for (unsigned i = 0; i < NumElts; ++i)
|
|
SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
|
|
B.buildUnmerge(SrcRegs, Vec);
|
|
|
|
SrcRegs[IdxVal] = MI.getOperand(2).getReg();
|
|
B.buildMergeLikeInstr(Dst, SrcRegs);
|
|
} else {
|
|
B.buildUndef(Dst);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeSinCos(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
Register SrcReg = MI.getOperand(1).getReg();
|
|
LLT Ty = MRI.getType(DstReg);
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
Register TrigVal;
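// The hardware sin/cos intrinsics take their input scaled by 1/(2*pi)
// (i.e. in revolutions rather than radians), so multiply the source by
// 1/(2*pi) first. Targets with a reduced trig input range also need the
// product wrapped into [0, 1) with amdgcn.fract.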
|
|
auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
|
|
if (ST.hasTrigReducedRange()) {
|
|
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
|
|
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
|
|
.addUse(MulVal.getReg(0))
|
|
.setMIFlags(Flags)
|
|
.getReg(0);
|
|
} else
|
|
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
|
|
|
|
Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
|
|
Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
|
|
B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
|
|
.addUse(TrigVal)
|
|
.setMIFlags(Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
|
|
MachineIRBuilder &B,
|
|
const GlobalValue *GV,
|
|
int64_t Offset,
|
|
unsigned GAFlags) const {
|
|
assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
|
|
// In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
|
|
// to the following code sequence:
|
|
//
|
|
// For constant address space:
|
|
// s_getpc_b64 s[0:1]
|
|
// s_add_u32 s0, s0, $symbol
|
|
// s_addc_u32 s1, s1, 0
|
|
//
|
|
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
|
|
// a fixup or relocation is emitted to replace $symbol with a literal
|
|
// constant, which is a pc-relative offset from the encoding of the $symbol
|
|
// operand to the global variable.
|
|
//
|
|
// For global address space:
|
|
// s_getpc_b64 s[0:1]
|
|
// s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
|
|
// s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
|
|
//
|
|
// s_getpc_b64 returns the address of the s_add_u32 instruction and then
|
|
// fixups or relocations are emitted to replace $symbol@*@lo and
|
|
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
|
|
// which is a 64-bit pc-relative offset from the encoding of the $symbol
|
|
// operand to the global variable.
|
|
|
|
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
|
|
Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
|
|
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
|
|
|
|
MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
|
|
.addDef(PCReg);
|
|
|
|
MIB.addGlobalAddress(GV, Offset, GAFlags);
|
|
if (GAFlags == SIInstrInfo::MO_NONE)
|
|
MIB.addImm(0);
|
|
else
|
|
MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
|
|
|
|
if (!B.getMRI()->getRegClassOrNull(PCReg))
|
|
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
|
|
|
|
if (PtrTy.getSizeInBits() == 32)
|
|
B.buildExtract(DstReg, PCReg, 0);
|
|
return true;
|
|
}
|
|
|
|
// Emit an ABS32_LO / ABS32_HI relocation stub.
|
|
void AMDGPULegalizerInfo::buildAbsGlobalAddress(
|
|
Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
|
|
MachineRegisterInfo &MRI) const {
|
|
bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
// Use the destination directly, if and only if we store the lower address
|
|
// part only and we don't have a register class being set.
|
|
Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
|
|
? DstReg
|
|
: MRI.createGenericVirtualRegister(S32);
|
|
|
|
if (!MRI.getRegClassOrNull(AddrLo))
|
|
MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
|
|
|
|
// Write the lower half.
|
|
B.buildInstr(AMDGPU::S_MOV_B32)
|
|
.addDef(AddrLo)
|
|
.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
|
|
|
|
// If required, write the upper half as well.
|
|
if (RequiresHighHalf) {
|
|
assert(PtrTy.getSizeInBits() == 64 &&
|
|
"Must provide a 64-bit pointer type!");
|
|
|
|
Register AddrHi = MRI.createGenericVirtualRegister(S32);
|
|
MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
|
|
|
|
B.buildInstr(AMDGPU::S_MOV_B32)
|
|
.addDef(AddrHi)
|
|
.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
|
|
|
|
// Use the destination directly, if and only if we don't have a register
|
|
// class being set.
|
|
Register AddrDst = !MRI.getRegClassOrNull(DstReg)
|
|
? DstReg
|
|
: MRI.createGenericVirtualRegister(LLT::scalar(64));
|
|
|
|
if (!MRI.getRegClassOrNull(AddrDst))
|
|
MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
|
|
|
|
B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
|
|
|
|
// If we created a new register for the destination, cast the result into
|
|
// the final output.
|
|
if (AddrDst != DstReg)
|
|
B.buildCast(DstReg, AddrDst);
|
|
} else if (AddrLo != DstReg) {
|
|
// If we created a new register for the destination, cast the result into
|
|
// the final output.
|
|
B.buildCast(DstReg, AddrLo);
|
|
}
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeGlobalValue(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
LLT Ty = MRI.getType(DstReg);
|
|
unsigned AS = Ty.getAddressSpace();
|
|
|
|
const GlobalValue *GV = MI.getOperand(1).getGlobal();
|
|
MachineFunction &MF = B.getMF();
|
|
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
|
|
if (!MFI->isModuleEntryFunction() &&
|
|
GV->getName() != "llvm.amdgcn.module.lds" &&
|
|
!AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
|
|
const Function &Fn = MF.getFunction();
|
|
DiagnosticInfoUnsupported BadLDSDecl(
|
|
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
|
|
DS_Warning);
|
|
Fn.getContext().diagnose(BadLDSDecl);
|
|
|
|
// We currently don't have a way to correctly allocate LDS objects that
|
|
// aren't directly associated with a kernel. We do force inlining of
|
|
// functions that use local objects. However, if these dead functions are
|
|
// not eliminated, we don't want a compile-time error. Just emit a warning
|
|
// and a trap, since there should be no callable path here.
|
|
B.buildTrap();
|
|
B.buildUndef(DstReg);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// TODO: We could emit code to handle the initialization somewhere.
// We ignore the initializer for now and legalize it to allow selection.
// The initializer will be rejected during assembly emission anyway.
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
if (!TLI->shouldUseLDSConstAddress(GV)) {
|
|
MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
|
|
return true; // Leave in place;
|
|
}
|
|
|
|
if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
|
|
Type *Ty = GV->getValueType();
|
|
// HIP uses an unsized array `extern __shared__ T s[]` or a similar
// zero-sized type in other languages to declare dynamic shared memory
// whose size is not known at compile time. It is allocated by the runtime
// and placed directly after the statically allocated ones. All such
// variables share the same offset.
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
|
|
// Adjust alignment for that dynamic shared memory array.
|
|
MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
|
|
LLT S32 = LLT::scalar(32);
|
|
auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
|
|
B.buildIntToPtr(DstReg, Sz);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
}
|
|
|
|
B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
|
|
*cast<GlobalVariable>(GV)));
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
|
|
buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
const SITargetLowering *TLI = ST.getTargetLowering();
|
|
|
|
if (TLI->shouldEmitFixup(GV)) {
|
|
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (TLI->shouldEmitPCReloc(GV)) {
|
|
buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
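// Otherwise, materialize a pc-relative address of the global's GOT entry and
// load the actual pointer to the global from it.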
|
|
|
|
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
|
|
|
|
LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
|
|
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
|
|
MachinePointerInfo::getGOT(MF),
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
|
|
MachineMemOperand::MOInvariant,
|
|
LoadTy, Align(8));
|
|
|
|
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
|
|
|
|
if (Ty.getSizeInBits() == 32) {
|
|
// Truncate if this is a 32-bit constant address.
|
|
auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
|
|
B.buildExtract(DstReg, Load, 0);
|
|
} else
|
|
B.buildLoad(DstReg, GOTAddr, *GOTMMO);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static LLT widenToNextPowerOf2(LLT Ty) {
|
|
if (Ty.isVector())
|
|
return Ty.changeElementCount(
|
|
ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
|
|
return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
|
|
MachineInstr &MI) const {
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
GISelChangeObserver &Observer = Helper.Observer;
|
|
|
|
Register PtrReg = MI.getOperand(1).getReg();
|
|
LLT PtrTy = MRI.getType(PtrReg);
|
|
unsigned AddrSpace = PtrTy.getAddressSpace();
|
|
|
|
if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
|
|
LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
|
|
Observer.changingInstr(MI);
|
|
MI.getOperand(1).setReg(Cast.getReg(0));
|
|
Observer.changedInstr(MI);
|
|
return true;
|
|
}
|
|
|
|
if (MI.getOpcode() != AMDGPU::G_LOAD)
|
|
return false;
|
|
|
|
Register ValReg = MI.getOperand(0).getReg();
|
|
LLT ValTy = MRI.getType(ValReg);
|
|
|
|
if (hasBufferRsrcWorkaround(ValTy)) {
|
|
Observer.changingInstr(MI);
|
|
castBufferRsrcFromV4I32(MI, B, MRI, 0);
|
|
Observer.changedInstr(MI);
|
|
return true;
|
|
}
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
const unsigned ValSize = ValTy.getSizeInBits();
|
|
const LLT MemTy = MMO->getMemoryType();
|
|
const Align MemAlign = MMO->getAlign();
|
|
const unsigned MemSize = MemTy.getSizeInBits();
|
|
const uint64_t AlignInBits = 8 * MemAlign.value();
|
|
|
|
// Widen non-power-of-2 loads to the alignment if needed
|
|
if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
|
|
const unsigned WideMemSize = PowerOf2Ceil(MemSize);
|
|
|
|
// This was already the correct extending load result type, so just adjust
|
|
// the memory type.
|
|
if (WideMemSize == ValSize) {
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
MachineMemOperand *WideMMO =
|
|
MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
|
|
Observer.changingInstr(MI);
|
|
MI.setMemRefs(MF, {WideMMO});
|
|
Observer.changedInstr(MI);
|
|
return true;
|
|
}
|
|
|
|
// Don't bother handling an edge case that should probably never be produced.
|
|
if (ValSize > WideMemSize)
|
|
return false;
|
|
|
|
LLT WideTy = widenToNextPowerOf2(ValTy);
|
|
|
|
Register WideLoad;
|
|
if (!WideTy.isVector()) {
|
|
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
|
|
B.buildTrunc(ValReg, WideLoad).getReg(0);
|
|
} else {
|
|
// Extract the subvector.
|
|
|
|
if (isRegisterType(ValTy)) {
|
|
// If this is a case where G_EXTRACT is legal, use it.
|
|
// (e.g. <3 x s32> -> <4 x s32>)
|
|
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
|
|
B.buildExtract(ValReg, WideLoad, 0);
|
|
} else {
|
|
// For cases where the widened type isn't a nice register value, unmerge
|
|
// from a widened register (e.g. <3 x s16> -> <4 x s16>)
|
|
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
|
|
B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
|
|
}
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
|
|
MachineInstr &MI) const {
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
GISelChangeObserver &Observer = Helper.Observer;
|
|
|
|
Register DataReg = MI.getOperand(0).getReg();
|
|
LLT DataTy = MRI.getType(DataReg);
|
|
|
|
if (hasBufferRsrcWorkaround(DataTy)) {
|
|
Observer.changingInstr(MI);
|
|
castBufferRsrcArgToV4I32(MI, B, 0);
|
|
Observer.changedInstr(MI);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFMad(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::float32() &&
      MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
    return true;
  if (Ty == LLT::float16() &&
      MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

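// Illustrative sketch of the rewrite above (generic MIR, not from a test),
// shown for a 32-bit value: the new and compare values are packed into a
// two-element vector operand of the target node, new value first:
//   %old:_(s32) = G_ATOMIC_CMPXCHG %ptr, %cmp, %new
// becomes
//   %pack:_(<2 x s32>) = G_BUILD_VECTOR %new:_(s32), %cmp:_(s32)
//   %old:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %pack
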
/// Return true if it's known that \p Src can never be an f32 denormal value.
|
|
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
|
|
Register Src) {
|
|
const MachineInstr *DefMI = MRI.getVRegDef(Src);
|
|
switch (DefMI->getOpcode()) {
|
|
case TargetOpcode::G_INTRINSIC: {
|
|
switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
|
|
case Intrinsic::amdgcn_frexp_mant:
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
}
|
|
case TargetOpcode::G_FFREXP: {
|
|
if (DefMI->getOperand(0).getReg() == Src)
|
|
return true;
|
|
break;
|
|
}
|
|
case TargetOpcode::G_FPEXT: {
|
|
return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
|
|
if (Flags & MachineInstr::FmAfn)
|
|
return true;
|
|
const auto &Options = MF.getTarget().Options;
|
|
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
|
|
}
|
|
|
|
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
|
|
unsigned Flags) {
|
|
return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
|
|
MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
|
|
DenormalMode::PreserveSign;
|
|
}
|
|
|
|
std::pair<Register, Register>
|
|
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
|
|
unsigned Flags) const {
|
|
if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
|
|
return {};
|
|
|
|
const LLT F32 = LLT::scalar(32);
|
|
auto SmallestNormal = B.buildFConstant(
|
|
F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
|
|
auto IsLtSmallestNormal =
|
|
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
|
|
|
|
auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
|
|
auto One = B.buildFConstant(F32, 1.0);
|
|
auto ScaleFactor =
|
|
B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
|
|
auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
|
|
|
|
return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
|
|
}
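// Worked consequence of the scaling above (a summary of the math, not new
// behavior): if |x| is below the smallest normal f32, ScaledInput is
// x * 0x1.0p+32, so log2(ScaledInput) = log2(x) + 32. Callers that consume
// the scaled value therefore subtract 32 (or 32 * log(2)/log(base)) from the
// final result whenever IsLtSmallestNormal is true.
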
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
|
|
MachineIRBuilder &B) const {
|
|
// v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
|
|
// If we have to handle denormals, scale up the input and adjust the result.
|
|
|
|
// scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
|
|
// log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
if (Ty == LLT::scalar(16)) {
|
|
const LLT F32 = LLT::scalar(32);
|
|
// Nothing in half is a denormal when promoted to f32.
|
|
auto Ext = B.buildFPExt(F32, Src, Flags);
|
|
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
|
|
.addUse(Ext.getReg(0))
|
|
.setMIFlags(Flags);
|
|
B.buildFPTrunc(Dst, Log2, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
assert(Ty == LLT::scalar(32));
|
|
|
|
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
|
|
if (!ScaledInput) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
|
|
.addUse(Src)
|
|
.setMIFlags(Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
|
|
.addUse(ScaledInput)
|
|
.setMIFlags(Flags);
|
|
|
|
auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
|
|
auto Zero = B.buildFConstant(Ty, 0.0);
|
|
auto ResultOffset =
|
|
B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
|
|
B.buildFSub(Dst, Log2, ResultOffset, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
|
|
Register Z, unsigned Flags) {
|
|
auto FMul = B.buildFMul(Ty, X, Y, Flags);
|
|
return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
|
|
MachineIRBuilder &B) const {
|
|
const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
|
|
assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
|
|
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register X = MI.getOperand(1).getReg();
|
|
unsigned Flags = MI.getFlags();
|
|
const LLT Ty = MRI.getType(X);
|
|
MachineFunction &MF = B.getMF();
|
|
|
|
const LLT F32 = LLT::scalar(32);
|
|
const LLT F16 = LLT::scalar(16);
|
|
|
|
const AMDGPUTargetMachine &TM =
|
|
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
|
|
|
|
if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
|
|
TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
|
|
if (Ty == F16 && !ST.has16BitInsts()) {
|
|
Register LogVal = MRI.createGenericVirtualRegister(F32);
|
|
auto PromoteSrc = B.buildFPExt(F32, X);
|
|
legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
|
|
B.buildFPTrunc(Dst, LogVal);
|
|
} else {
|
|
legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
|
|
if (ScaledInput)
|
|
X = ScaledInput;
|
|
|
|
auto Y =
|
|
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
|
|
|
|
Register R;
|
|
if (ST.hasFastFMAF32()) {
|
|
// c+cc are ln(2)/ln(10) to more than 49 bits
|
|
const float c_log10 = 0x1.344134p-2f;
|
|
const float cc_log10 = 0x1.09f79ep-26f;
|
|
|
|
// c + cc is ln(2) to more than 49 bits
|
|
const float c_log = 0x1.62e42ep-1f;
|
|
const float cc_log = 0x1.efa39ep-25f;
|
|
|
|
auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
|
|
auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
|
|
|
|
R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
|
|
auto NegR = B.buildFNeg(Ty, R, Flags);
|
|
auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
|
|
auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
|
|
R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
|
|
} else {
|
|
// ch+ct is ln(2)/ln(10) to more than 36 bits
|
|
const float ch_log10 = 0x1.344000p-2f;
|
|
const float ct_log10 = 0x1.3509f6p-18f;
|
|
|
|
// ch + ct is ln(2) to more than 36 bits
|
|
const float ch_log = 0x1.62e000p-1f;
|
|
const float ct_log = 0x1.0bfbe8p-15f;
|
|
|
|
auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
|
|
auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
|
|
|
|
auto MaskConst = B.buildConstant(Ty, 0xfffff000);
|
|
auto YH = B.buildAnd(Ty, Y, MaskConst);
|
|
auto YT = B.buildFSub(Ty, Y, YH, Flags);
|
|
auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
|
|
|
|
Register Mad0 =
|
|
getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
|
|
Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
|
|
R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
|
|
}
|
|
|
|
const bool IsFiniteOnly =
|
|
(MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
|
|
(MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
|
|
|
|
if (!IsFiniteOnly) {
|
|
// Expand isfinite(x) => fabs(x) < inf
|
|
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
|
|
auto Fabs = B.buildFAbs(Ty, Y);
|
|
auto IsFinite =
|
|
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
|
|
R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
|
|
}
|
|
|
|
if (ScaledInput) {
|
|
auto Zero = B.buildFConstant(Ty, 0.0);
|
|
auto ShiftK =
|
|
B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
|
|
auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
|
|
B.buildFSub(Dst, R, Shift, Flags);
|
|
} else {
|
|
B.buildCopy(Dst, R);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
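// Hedged summary of the fast-FMA path above: C + CC is a two-word split of
// ln(2) (or ln(2)/ln(10)), and the FMA chain forms Y * (C + CC) in roughly
// double-single precision:
//   R    = Y * C                 (rounded head product)
//   FMA0 = fma(Y, C, -R)         (exact rounding error of the head)
//   FMA1 = fma(Y, CC, FMA0)      (tail of the constant folded in)
//   R'   = R + FMA1  ~=  Y * (C + CC)
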
bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
|
|
Register Src, bool IsLog10,
|
|
unsigned Flags) const {
|
|
const double Log2BaseInverted =
|
|
IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
|
|
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
|
|
if (Ty == LLT::scalar(32)) {
|
|
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
|
|
if (ScaledInput) {
|
|
auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
|
|
.addUse(Src)
|
|
.setMIFlags(Flags);
|
|
auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
|
|
auto Zero = B.buildFConstant(Ty, 0.0);
|
|
auto ResultOffset =
|
|
B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
|
|
auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
|
|
|
|
if (ST.hasFastFMAF32())
|
|
B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
|
|
else {
|
|
auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
|
|
B.buildFAdd(Dst, Mul, ResultOffset, Flags);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
|
|
auto Log2Operand = Ty == LLT::scalar(16)
|
|
? B.buildFLog2(Ty, Src, Flags)
|
|
: B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
|
|
.addUse(Src)
|
|
.setMIFlags(Flags);
|
|
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
|
|
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
|
|
return true;
|
|
}
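// The fallback at the end of the function above is the plain base-change
// identity, e.g. for log10:
//   log10(x) = log2(x) * (ln(2)/ln(10)) ~= log2(x) * 0.30103
// computed as a single multiply by Log2BaseInverted.
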
bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
|
|
MachineIRBuilder &B) const {
|
|
// v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
|
|
// If we have to handle denormals, scale up the input and adjust the result.
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
unsigned Flags = MI.getFlags();
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
const LLT F16 = LLT::scalar(16);
|
|
const LLT F32 = LLT::scalar(32);
|
|
|
|
if (Ty == F16) {
|
|
// Nothing in half is a denormal when promoted to f32.
|
|
auto Ext = B.buildFPExt(F32, Src, Flags);
|
|
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
|
|
.addUse(Ext.getReg(0))
|
|
.setMIFlags(Flags);
|
|
B.buildFPTrunc(Dst, Log2, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
assert(Ty == F32);
|
|
|
|
if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
|
|
.addUse(Src)
|
|
.setMIFlags(Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// bool needs_scaling = x < -0x1.f80000p+6f;
|
|
// v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
|
|
|
|
// -nextafter(128.0, -1)
|
|
auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
|
|
auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
|
|
RangeCheckConst, Flags);
|
|
|
|
auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
|
|
auto Zero = B.buildFConstant(Ty, 0.0);
|
|
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
|
|
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
|
|
|
|
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
|
|
.addUse(AddInput.getReg(0))
|
|
.setMIFlags(Flags);
|
|
|
|
auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
|
|
auto One = B.buildFConstant(Ty, 1.0);
|
|
auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
|
|
B.buildFMul(Dst, Exp2, ResultScale, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
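// Numeric illustration of the scaling above (not from the source): for
// x = -140.0f, exp2(x) = 2^-140 is far below the smallest normal f32
// (2^-126), so the lowering evaluates v_exp_f32(-140 + 64) * 0x1.0p-64f.
// v_exp_f32 then only sees an argument whose result (2^-76) is a normal
// number, and the final multiply by 2^-64 produces the correctly scaled
// result 2^-140.
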
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
|
|
Register X, unsigned Flags) const {
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
LLT F32 = LLT::scalar(32);
|
|
|
|
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
|
|
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
|
|
auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
|
|
|
|
if (Ty == F32) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
|
|
.addUse(Mul.getReg(0))
|
|
.setMIFlags(Flags);
|
|
} else {
|
|
B.buildFExp2(Dst, Mul.getReg(0), Flags);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
|
|
auto NeedsScaling =
|
|
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
|
|
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
|
|
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
|
|
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
|
|
|
|
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
|
|
auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
|
|
|
|
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
|
|
.addUse(ExpInput.getReg(0))
|
|
.setMIFlags(Flags);
|
|
|
|
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
|
|
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
|
|
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
|
|
return true;
|
|
}
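// Hedged reading of the constants above (an interpretation, not stated in
// the source): the threshold -0x1.5d58a0p+6f is approximately ln(2^-126),
// the point below which e^x underflows the f32 normal range; adding
// 0x1.0p+6f (64) shifts the argument back into range, and the result scale
// 0x1.969d48p-93f is approximately e^-64, which undoes that shift.
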
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
|
|
MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register X = MI.getOperand(1).getReg();
|
|
const unsigned Flags = MI.getFlags();
|
|
MachineFunction &MF = B.getMF();
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
LLT Ty = MRI.getType(Dst);
|
|
const LLT F16 = LLT::scalar(16);
|
|
const LLT F32 = LLT::scalar(32);
|
|
const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
|
|
|
|
if (Ty == F16) {
|
|
// v_exp_f16 (fmul x, log2e)
|
|
if (allowApproxFunc(MF, Flags)) {
|
|
// TODO: Does this really require fast?
|
|
legalizeFExpUnsafe(B, Dst, X, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// exp(f16 x) ->
|
|
// fptrunc (v_exp_f32 (fmul (fpext x), log2e))
|
|
|
|
// Nothing in half is a denormal when promoted to f32.
|
|
auto Ext = B.buildFPExt(F32, X, Flags);
|
|
Register Lowered = MRI.createGenericVirtualRegister(F32);
|
|
legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
|
|
B.buildFPTrunc(Dst, Lowered, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
assert(Ty == F32);
|
|
|
|
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
|
|
// library behavior. Also, is known-not-daz source sufficient?
|
|
if (allowApproxFunc(MF, Flags)) {
|
|
legalizeFExpUnsafe(B, Dst, X, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// Algorithm:
|
|
//
|
|
// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
|
|
//
|
|
// x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
|
|
// n = 64*m + j, 0 <= j < 64
|
|
//
|
|
// e^x = 2^((64*m + j + f)/64)
|
|
// = (2^m) * (2^(j/64)) * 2^(f/64)
|
|
// = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
|
|
//
|
|
// f = x*(64/ln(2)) - n
|
|
// r = f*(ln(2)/64) = x - n*(ln(2)/64)
|
|
//
|
|
// e^x = (2^m) * (2^(j/64)) * e^r
|
|
//
|
|
// (2^(j/64)) is precomputed
|
|
//
|
|
// e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
|
|
// e^r = 1 + q
|
|
//
|
|
// q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
|
|
//
|
|
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
|
|
const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
|
|
Register PH, PL;
|
|
|
|
if (ST.hasFastFMAF32()) {
|
|
const float c_exp = numbers::log2ef;
|
|
const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
|
|
const float c_exp10 = 0x1.a934f0p+1f;
|
|
const float cc_exp10 = 0x1.2f346ep-24f;
|
|
|
|
auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
|
|
PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
|
|
auto NegPH = B.buildFNeg(Ty, PH, Flags);
|
|
auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
|
|
|
|
auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
|
|
PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
|
|
} else {
|
|
const float ch_exp = 0x1.714000p+0f;
|
|
const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
|
|
|
|
const float ch_exp10 = 0x1.a92000p+1f;
|
|
const float cl_exp10 = 0x1.4f0978p-11f;
|
|
|
|
auto MaskConst = B.buildConstant(Ty, 0xfffff000);
|
|
auto XH = B.buildAnd(Ty, X, MaskConst);
|
|
auto XL = B.buildFSub(Ty, X, XH, Flags);
|
|
|
|
auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
|
|
PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
|
|
|
|
auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
|
|
auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
|
|
|
|
Register Mad0 =
|
|
getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
|
|
PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
|
|
}
|
|
|
|
auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
|
|
|
|
// It is unsafe to contract this fsub into the PH multiply.
|
|
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
|
|
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
|
|
auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
|
|
|
|
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
|
|
.addUse(A.getReg(0))
|
|
.setMIFlags(Flags);
|
|
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
|
|
|
|
auto UnderflowCheckConst =
|
|
B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
|
|
auto Zero = B.buildFConstant(Ty, 0.0);
|
|
auto Underflow =
|
|
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
|
|
|
|
R = B.buildSelect(Ty, Underflow, Zero, R);
|
|
|
|
const auto &Options = MF.getTarget().Options;
|
|
|
|
if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
|
|
auto OverflowCheckConst =
|
|
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
|
|
|
|
auto Overflow =
|
|
B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
|
|
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
|
|
R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
|
|
}
|
|
|
|
B.buildCopy(Dst, R);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
|
|
MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
Register Src1 = MI.getOperand(2).getReg();
|
|
unsigned Flags = MI.getFlags();
|
|
LLT Ty = B.getMRI()->getType(Dst);
|
|
const LLT F16 = LLT::float16();
|
|
const LLT F32 = LLT::float32();
|
|
|
|
if (Ty == F32) {
|
|
auto Log = B.buildFLog2(F32, Src0, Flags);
|
|
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
|
|
.addUse(Log.getReg(0))
|
|
.addUse(Src1)
|
|
.setMIFlags(Flags);
|
|
B.buildFExp2(Dst, Mul, Flags);
|
|
} else if (Ty == F16) {
|
|
// There's no f16 fmul_legacy, so we need to convert for it.
|
|
auto Log = B.buildFLog2(F16, Src0, Flags);
|
|
auto Ext0 = B.buildFPExt(F32, Log, Flags);
|
|
auto Ext1 = B.buildFPExt(F32, Src1, Flags);
|
|
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
|
|
.addUse(Ext0.getReg(0))
|
|
.addUse(Ext1.getReg(0))
|
|
.setMIFlags(Flags);
|
|
B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
|
|
} else
|
|
return false;
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
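// The lowering above is the usual identity pow(x, y) = exp2(y * log2(x)),
// e.g. pow(2.0f, 10.0f) -> exp2(10 * log2(2)) = exp2(10) = 1024. The
// multiply is emitted as amdgcn_fmul_legacy, which treats 0 * inf/nan as 0,
// so degenerate log2 results fold to 0 rather than propagating.
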
// Find a source register, ignoring any possible source modifiers.
|
|
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
|
|
Register ModSrc = OrigSrc;
|
|
if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
|
|
ModSrc = SrcFNeg->getOperand(1).getReg();
|
|
if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
|
|
ModSrc = SrcFAbs->getOperand(1).getReg();
|
|
} else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
|
|
ModSrc = SrcFAbs->getOperand(1).getReg();
|
|
return ModSrc;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT F64 = LLT::float64();
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register OrigSrc = MI.getOperand(1).getReg();
|
|
unsigned Flags = MI.getFlags();
|
|
assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
|
|
"this should not have been custom lowered");
|
|
|
|
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
|
|
// is used instead. However, SI doesn't have V_FLOOR_F64, so the most
|
|
// efficient way to implement it is using V_FRACT_F64. The workaround for the
|
|
// V_FRACT bug is:
|
|
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
|
|
//
|
|
// Convert floor(x) to (x - fract(x))
|
|
|
|
auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
|
|
.addUse(OrigSrc)
|
|
.setMIFlags(Flags);
|
|
|
|
// Give source modifier matching some assistance before obscuring a foldable
|
|
// pattern.
|
|
|
|
// TODO: We can avoid the neg on the fract? The input sign to fract
|
|
// shouldn't matter?
|
|
Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
|
|
|
|
auto Const =
|
|
B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
|
|
|
|
Register Min = MRI.createGenericVirtualRegister(F64);
|
|
|
|
// We don't need to concern ourselves with the snan handling difference, so
|
|
// use the one which will directly select.
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
if (MFI->getMode().IEEE)
|
|
B.buildFMinNumIEEE(Min, Fract, Const, Flags);
|
|
else
|
|
B.buildFMinNum(Min, Fract, Const, Flags);
|
|
|
|
Register CorrectedFract = Min;
|
|
if (!MI.getFlag(MachineInstr::FmNoNans)) {
|
|
auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
|
|
CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
|
|
}
|
|
|
|
auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
|
|
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
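// Worked example of the fallback above (illustrative): floor(-1.25) is
// computed as x - fract(x), where fract(-1.25) = -1.25 - floor(-1.25)
// = 0.75, giving -1.25 - 0.75 = -2.0. The min against
// 0x1.fffffffffffffp-1 only changes the buggy V_FRACT results, and the
// ordered-compare select passes NaN inputs through unchanged.
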
// Turn an illegal packed v2s16 build vector into bit operations.
|
|
// TODO: This should probably be a bitcast action in LegalizerHelper.
|
|
bool AMDGPULegalizerInfo::legalizeBuildVector(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT S16 = LLT::scalar(16);
|
|
assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
|
|
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
Register Src1 = MI.getOperand(2).getReg();
|
|
|
|
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
|
|
assert(MRI.getType(Src0) == S32);
|
|
Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
|
|
Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
|
|
}
|
|
|
|
auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
|
|
B.buildBitcast(Dst, Merge);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
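// Illustrative MIR for the lowering above (a sketch, not from a test):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)
// with an extra pair of G_TRUNCs in front for G_BUILD_VECTOR_TRUNC sources.
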
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
|
|
//
|
|
// Source and accumulation registers must all be 32-bits.
|
|
//
|
|
// TODO: When the multiply is uniform, we should produce a code sequence
|
|
// that is better suited to instruction selection on the SALU. Instead of
|
|
// the outer loop going over parts of the result, the outer loop should go
|
|
// over parts of one of the factors. This should result in instruction
|
|
// selection that makes full use of S_ADDC_U32 instructions.
|
|
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
|
|
MutableArrayRef<Register> Accum,
|
|
ArrayRef<Register> Src0,
|
|
ArrayRef<Register> Src1,
|
|
bool UsePartialMad64_32,
|
|
bool SeparateOddAlignedProducts) const {
|
|
// Use (possibly empty) vectors of S1 registers to represent the set of
|
|
// carries from one pair of positions to the next.
|
|
using Carry = SmallVector<Register, 2>;
|
|
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
GISelKnownBits &KB = *Helper.getKnownBits();
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT S64 = LLT::scalar(64);
|
|
|
|
Register Zero32;
|
|
Register Zero64;
|
|
|
|
auto getZero32 = [&]() -> Register {
|
|
if (!Zero32)
|
|
Zero32 = B.buildConstant(S32, 0).getReg(0);
|
|
return Zero32;
|
|
};
|
|
auto getZero64 = [&]() -> Register {
|
|
if (!Zero64)
|
|
Zero64 = B.buildConstant(S64, 0).getReg(0);
|
|
return Zero64;
|
|
};
|
|
|
|
SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
|
|
for (unsigned i = 0; i < Src0.size(); ++i) {
|
|
Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
|
|
Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
|
|
}
|
|
|
|
// Merge the given carries into the 32-bit LocalAccum, which is modified
|
|
// in-place.
|
|
//
|
|
// Returns the carry-out, which is a single S1 register or null.
|
|
auto mergeCarry =
|
|
[&](Register &LocalAccum, const Carry &CarryIn) -> Register {
|
|
if (CarryIn.empty())
|
|
return Register();
|
|
|
|
bool HaveCarryOut = true;
|
|
Register CarryAccum;
|
|
if (CarryIn.size() == 1) {
|
|
if (!LocalAccum) {
|
|
LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
|
|
return Register();
|
|
}
|
|
|
|
CarryAccum = getZero32();
|
|
} else {
|
|
CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
|
|
for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
|
|
CarryAccum =
|
|
B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
|
|
.getReg(0);
|
|
}
|
|
|
|
if (!LocalAccum) {
|
|
LocalAccum = getZero32();
|
|
HaveCarryOut = false;
|
|
}
|
|
}
|
|
|
|
auto Add =
|
|
B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
|
|
LocalAccum = Add.getReg(0);
|
|
return HaveCarryOut ? Add.getReg(1) : Register();
|
|
};
|
|
|
|
// Build a multiply-add chain to compute
|
|
//
|
|
// LocalAccum + (partial products at DstIndex)
|
|
// + (opportunistic subset of CarryIn)
|
|
//
|
|
// LocalAccum is an array of one or two 32-bit registers that are updated
|
|
// in-place. The incoming registers may be null.
|
|
//
|
|
// In some edge cases, carry-ins can be consumed "for free". In that case,
|
|
// the consumed carry bits are removed from CarryIn in-place.
|
|
auto buildMadChain =
|
|
[&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
|
|
-> Carry {
|
|
assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
|
|
(DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
|
|
|
|
Carry CarryOut;
|
|
unsigned j0 = 0;
|
|
|
|
// Use plain 32-bit multiplication for the most significant part of the
|
|
// result by default.
|
|
if (LocalAccum.size() == 1 &&
|
|
(!UsePartialMad64_32 || !CarryIn.empty())) {
|
|
do {
|
|
// Skip multiplication if one of the operands is 0
|
|
unsigned j1 = DstIndex - j0;
|
|
if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
|
|
++j0;
|
|
continue;
|
|
}
|
|
auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
|
|
if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
|
|
LocalAccum[0] = Mul.getReg(0);
|
|
} else {
|
|
if (CarryIn.empty()) {
|
|
LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
|
|
} else {
|
|
LocalAccum[0] =
|
|
B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
|
|
.getReg(0);
|
|
CarryIn.pop_back();
|
|
}
|
|
}
|
|
++j0;
|
|
} while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
|
|
}
|
|
|
|
// Build full 64-bit multiplies.
|
|
if (j0 <= DstIndex) {
|
|
bool HaveSmallAccum = false;
|
|
Register Tmp;
|
|
|
|
if (LocalAccum[0]) {
|
|
if (LocalAccum.size() == 1) {
|
|
Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
|
|
HaveSmallAccum = true;
|
|
} else if (LocalAccum[1]) {
|
|
Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
|
|
HaveSmallAccum = false;
|
|
} else {
|
|
Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
|
|
HaveSmallAccum = true;
|
|
}
|
|
} else {
|
|
assert(LocalAccum.size() == 1 || !LocalAccum[1]);
|
|
Tmp = getZero64();
|
|
HaveSmallAccum = true;
|
|
}
|
|
|
|
do {
|
|
unsigned j1 = DstIndex - j0;
|
|
if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
|
|
++j0;
|
|
continue;
|
|
}
|
|
auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
|
|
{Src0[j0], Src1[j1], Tmp});
|
|
Tmp = Mad.getReg(0);
|
|
if (!HaveSmallAccum)
|
|
CarryOut.push_back(Mad.getReg(1));
|
|
HaveSmallAccum = false;
|
|
|
|
++j0;
|
|
} while (j0 <= DstIndex);
|
|
|
|
auto Unmerge = B.buildUnmerge(S32, Tmp);
|
|
LocalAccum[0] = Unmerge.getReg(0);
|
|
if (LocalAccum.size() > 1)
|
|
LocalAccum[1] = Unmerge.getReg(1);
|
|
}
|
|
|
|
return CarryOut;
|
|
};
|
|
|
|
// Outer multiply loop, iterating over destination parts from least
|
|
// significant to most significant parts.
|
|
//
|
|
// The columns of the following diagram correspond to the destination parts
|
|
// affected by one iteration of the outer loop (ignoring boundary
|
|
// conditions).
|
|
//
|
|
// Dest index relative to 2 * i: 1 0 -1
|
|
// ------
|
|
// Carries from previous iteration: e o
|
|
// Even-aligned partial product sum: E E .
|
|
// Odd-aligned partial product sum: O O
|
|
//
|
|
// 'o' is OddCarry, 'e' is EvenCarry.
|
|
// EE and OO are computed from partial products via buildMadChain and use
|
|
// accumulation where possible and appropriate.
|
|
//
|
|
Register SeparateOddCarry;
|
|
Carry EvenCarry;
|
|
Carry OddCarry;
|
|
|
|
for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
|
|
Carry OddCarryIn = std::move(OddCarry);
|
|
Carry EvenCarryIn = std::move(EvenCarry);
|
|
OddCarry.clear();
|
|
EvenCarry.clear();
|
|
|
|
// Partial products at offset 2 * i.
|
|
if (2 * i < Accum.size()) {
|
|
auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
|
|
EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
|
|
}
|
|
|
|
// Partial products at offset 2 * i - 1.
|
|
if (i > 0) {
|
|
if (!SeparateOddAlignedProducts) {
|
|
auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
|
|
OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
|
|
} else {
|
|
bool IsHighest = 2 * i >= Accum.size();
|
|
Register SeparateOddOut[2];
|
|
auto LocalAccum = MutableArrayRef(SeparateOddOut)
|
|
.take_front(IsHighest ? 1 : 2);
|
|
OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
|
|
|
|
MachineInstr *Lo;
|
|
|
|
if (i == 1) {
|
|
if (!IsHighest)
|
|
Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
|
|
else
|
|
Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
|
|
} else {
|
|
Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
|
|
SeparateOddCarry);
|
|
}
|
|
Accum[2 * i - 1] = Lo->getOperand(0).getReg();
|
|
|
|
if (!IsHighest) {
|
|
auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
|
|
Lo->getOperand(1).getReg());
|
|
Accum[2 * i] = Hi.getReg(0);
|
|
SeparateOddCarry = Hi.getReg(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Add in the carries from the previous iteration
|
|
if (i > 0) {
|
|
if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
|
|
EvenCarryIn.push_back(CarryOut);
|
|
|
|
if (2 * i < Accum.size()) {
|
|
if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
|
|
OddCarry.push_back(CarryOut);
|
|
}
|
|
}
|
|
}
|
|
}
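// Sketch of the arithmetic performed above, for the 64 x 64 -> 64 bit case
// (two 32-bit parts per operand):
//   (a1*2^32 + a0) * (b1*2^32 + b0)
//     = a0*b0 + (a0*b1 + a1*b0)*2^32 + (a1*b1)*2^64
// Only the low parts that fit in Accum are kept; each partial product is
// formed with MAD_64_32 (or a plain 32-bit multiply for the most
// significant column), and carries between columns are threaded through the
// OddCarry/EvenCarry vectors.
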
// Custom narrowing of wide multiplies using wide multiply-add instructions.
|
|
//
|
|
// TODO: If the multiply is followed by an addition, we should attempt to
|
|
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
|
|
bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
|
|
MachineInstr &MI) const {
|
|
assert(ST.hasMad64_32());
|
|
assert(MI.getOpcode() == TargetOpcode::G_MUL);
|
|
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
Register Src0 = MI.getOperand(1).getReg();
|
|
Register Src1 = MI.getOperand(2).getReg();
|
|
|
|
LLT Ty = MRI.getType(DstReg);
|
|
assert(Ty.isScalar());
|
|
|
|
unsigned Size = Ty.getSizeInBits();
|
|
unsigned NumParts = Size / 32;
|
|
assert((Size % 32) == 0);
|
|
assert(NumParts >= 2);
|
|
|
|
// Whether to use MAD_64_32 for partial products whose high half is
|
|
// discarded. This avoids some ADD instructions but risks false dependency
|
|
// stalls on some subtargets in some cases.
|
|
const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
|
|
|
|
// Whether to compute odd-aligned partial products separately. This is
|
|
// advisable on subtargets where the accumulator of MAD_64_32 must be placed
|
|
// in an even-aligned VGPR.
|
|
const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
SmallVector<Register, 2> Src0Parts, Src1Parts;
|
|
for (unsigned i = 0; i < NumParts; ++i) {
|
|
Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
|
|
Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
|
|
}
|
|
B.buildUnmerge(Src0Parts, Src0);
|
|
B.buildUnmerge(Src1Parts, Src1);
|
|
|
|
SmallVector<Register, 2> AccumRegs(NumParts);
|
|
buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
|
|
SeparateOddAlignedProducts);
|
|
|
|
B.buildMergeLikeInstr(DstReg, AccumRegs);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
|
|
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
|
|
// case with a single min instruction instead of a compare+select.
|
|
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
LLT DstTy = MRI.getType(Dst);
|
|
LLT SrcTy = MRI.getType(Src);
|
|
|
|
unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
|
|
? AMDGPU::G_AMDGPU_FFBH_U32
|
|
: AMDGPU::G_AMDGPU_FFBL_B32;
|
|
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
|
|
B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
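// Why the min suffices (hedged on the exact hardware encoding): the
// ffbh/ffbl style instructions return -1 (all ones) for a zero input, so
// treating that result as unsigned and taking umin(0xffffffff, BitWidth)
// yields BitWidth, which is the defined G_CTLZ/G_CTTZ result for zero.
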
bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(1).getReg();
|
|
LLT SrcTy = MRI.getType(Src);
|
|
TypeSize NumBits = SrcTy.getSizeInBits();
|
|
|
|
assert(NumBits < 32u);
|
|
|
|
auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
|
|
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
|
|
auto Shift = B.buildShl(S32, Extend, ShiftAmt);
|
|
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
|
|
B.buildTrunc(Dst, Ctlz);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
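// Worked example of the widening above (illustrative): for an s16 source
// with value 0x0010, the any-extended value is shifted left by 16 to
// 0x00100000, so the 32-bit FFBH count (11) equals the number of leading
// zeros of the original 16-bit value.
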
// Check that this is a G_XOR x, -1
|
|
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
|
|
if (MI.getOpcode() != TargetOpcode::G_XOR)
|
|
return false;
|
|
auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
|
|
return ConstVal && *ConstVal == -1;
|
|
}
|
|
|
|
// Return the use branch instruction, otherwise null if the usage is invalid.
|
|
static MachineInstr *
|
|
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
|
|
MachineBasicBlock *&UncondBrTarget, bool &Negated) {
|
|
Register CondDef = MI.getOperand(0).getReg();
|
|
if (!MRI.hasOneNonDBGUse(CondDef))
|
|
return nullptr;
|
|
|
|
MachineBasicBlock *Parent = MI.getParent();
|
|
MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
|
|
|
|
if (isNot(MRI, *UseMI)) {
|
|
Register NegatedCond = UseMI->getOperand(0).getReg();
|
|
if (!MRI.hasOneNonDBGUse(NegatedCond))
|
|
return nullptr;
|
|
|
|
// We're deleting the def of this value, so we need to remove it.
|
|
eraseInstr(*UseMI, MRI);
|
|
|
|
UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
|
|
Negated = true;
|
|
}
|
|
|
|
if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
|
|
return nullptr;
|
|
|
|
// Make sure the cond br is followed by a G_BR, or is the last instruction.
|
|
MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
|
|
if (Next == Parent->end()) {
|
|
MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
|
|
if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
|
|
return nullptr;
|
|
UncondBrTarget = &*NextMBB;
|
|
} else {
|
|
if (Next->getOpcode() != AMDGPU::G_BR)
|
|
return nullptr;
|
|
Br = &*Next;
|
|
UncondBrTarget = Br->getOperand(0).getMBB();
|
|
}
|
|
|
|
return UseMI;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
|
|
const ArgDescriptor *Arg,
|
|
const TargetRegisterClass *ArgRC,
|
|
LLT ArgTy) const {
|
|
MCRegister SrcReg = Arg->getRegister();
|
|
assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
|
|
assert(DstReg.isVirtual() && "Virtual register expected");
|
|
|
|
Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
|
|
*ArgRC, B.getDebugLoc(), ArgTy);
|
|
if (Arg->isMasked()) {
|
|
// TODO: Should we try to emit this once in the entry block?
|
|
const LLT S32 = LLT::scalar(32);
|
|
const unsigned Mask = Arg->getMask();
|
|
const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
|
|
|
|
Register AndMaskSrc = LiveIn;
|
|
|
|
// TODO: Avoid clearing the high bits if we know workitem id y/z are always
|
|
// 0.
|
|
if (Shift != 0) {
|
|
auto ShiftAmt = B.buildConstant(S32, Shift);
|
|
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
|
|
}
|
|
|
|
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
|
|
} else {
|
|
B.buildCopy(DstReg, LiveIn);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::loadInputValue(
|
|
Register DstReg, MachineIRBuilder &B,
|
|
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
const ArgDescriptor *Arg = nullptr;
|
|
const TargetRegisterClass *ArgRC;
|
|
LLT ArgTy;
|
|
|
|
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
|
|
const ArgDescriptor WorkGroupIDX =
|
|
ArgDescriptor::createRegister(AMDGPU::TTMP9);
|
|
// If GridZ is not programmed in an entry function then the hardware will set
|
|
// it to all zeros, so there is no need to mask the GridY value in the low
|
|
// order bits.
|
|
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
|
|
AMDGPU::TTMP7,
|
|
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
|
|
const ArgDescriptor WorkGroupIDZ =
|
|
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
|
|
if (ST.hasArchitectedSGPRs() &&
|
|
(AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
|
|
switch (ArgType) {
|
|
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
|
|
Arg = &WorkGroupIDX;
|
|
ArgRC = &AMDGPU::SReg_32RegClass;
|
|
ArgTy = LLT::scalar(32);
|
|
break;
|
|
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
|
|
Arg = &WorkGroupIDY;
|
|
ArgRC = &AMDGPU::SReg_32RegClass;
|
|
ArgTy = LLT::scalar(32);
|
|
break;
|
|
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
|
|
Arg = &WorkGroupIDZ;
|
|
ArgRC = &AMDGPU::SReg_32RegClass;
|
|
ArgTy = LLT::scalar(32);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!Arg)
|
|
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
|
|
|
|
if (!Arg) {
|
|
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
|
|
// The intrinsic may appear when we have a 0 sized kernarg segment, in which
|
|
// case the pointer argument may be missing and we use null.
|
|
B.buildConstant(DstReg, 0);
|
|
return true;
|
|
}
|
|
|
|
// It's undefined behavior if a function marked with the amdgpu-no-*
|
|
// attributes uses the corresponding intrinsic.
|
|
B.buildUndef(DstReg);
|
|
return true;
|
|
}
|
|
|
|
if (!Arg->isRegister() || !Arg->getRegister().isValid())
|
|
return false; // TODO: Handle these
|
|
return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
|
|
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
|
|
if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
|
|
return false;
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
|
|
int64_t C) {
|
|
B.buildConstant(MI.getOperand(0).getReg(), C);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
|
|
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
|
|
unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
|
|
if (MaxID == 0)
|
|
return replaceWithConstant(B, MI, 0);
|
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
const ArgDescriptor *Arg;
|
|
const TargetRegisterClass *ArgRC;
|
|
LLT ArgTy;
|
|
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
if (!Arg) {
|
|
// It's undefined behavior if a function marked with the amdgpu-no-*
|
|
// attributes uses the corresponding intrinsic.
|
|
B.buildUndef(DstReg);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (Arg->isMasked()) {
|
|
// Don't bother inserting AssertZext for packed IDs since we're emitting the
|
|
// masking operations anyway.
|
|
//
|
|
// TODO: We could assert the top bit is 0 for the source copy.
|
|
if (!loadInputValue(DstReg, B, ArgType))
|
|
return false;
|
|
} else {
|
|
Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
|
|
if (!loadInputValue(TmpReg, B, ArgType))
|
|
return false;
|
|
B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
|
|
int64_t Offset) const {
|
|
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
|
|
Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
|
|
|
|
// TODO: If we passed in the base kernel offset we could have a better
|
|
// alignment than 4, but we don't really need it.
|
|
if (!loadInputValue(KernArgReg, B,
|
|
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
|
|
llvm_unreachable("failed to find kernarg segment ptr");
|
|
|
|
auto COffset = B.buildConstant(LLT::scalar(64), Offset);
|
|
// TODO: Should get nuw
|
|
return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
|
|
}
|
|
|
|
/// Legalize a value that's loaded from kernel arguments. This is only used by
|
|
/// legacy intrinsics.
|
|
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
|
|
MachineIRBuilder &B,
|
|
uint64_t Offset,
|
|
Align Alignment) const {
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
|
|
assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
|
|
"unexpected kernarg parameter type");
|
|
|
|
Register Ptr = getKernargParameterPtr(B, Offset);
|
|
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
|
B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
|
|
MachineMemOperand::MODereferenceable |
|
|
MachineMemOperand::MOInvariant);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
LLT DstTy = MRI.getType(Dst);
|
|
LLT S16 = LLT::scalar(16);
|
|
LLT S32 = LLT::scalar(32);
|
|
LLT S64 = LLT::scalar(64);
|
|
|
|
if (DstTy == S16)
|
|
return legalizeFDIV16(MI, MRI, B);
|
|
if (DstTy == S32)
|
|
return legalizeFDIV32(MI, MRI, B);
|
|
if (DstTy == S64)
|
|
return legalizeFDIV64(MI, MRI, B);
|
|
|
|
return false;
|
|
}
|
|
|
|
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
|
|
Register DstDivReg,
|
|
Register DstRemReg,
|
|
Register X,
|
|
Register Y) const {
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
// See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
|
|
// algorithm used here.
|
|
|
|
// Initial estimate of inv(y).
|
|
auto FloatY = B.buildUITOFP(S32, Y);
|
|
auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
|
|
auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
|
|
auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
|
|
auto Z = B.buildFPTOUI(S32, ScaledY);
|
|
|
|
// One round of UNR.
|
|
auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
|
|
auto NegYZ = B.buildMul(S32, NegY, Z);
|
|
Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
|
|
|
|
// Quotient/remainder estimate.
|
|
auto Q = B.buildUMulH(S32, X, Z);
|
|
auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
|
|
|
|
// First quotient/remainder refinement.
|
|
auto One = B.buildConstant(S32, 1);
|
|
auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
|
|
if (DstDivReg)
|
|
Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
|
|
R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
|
|
|
|
// Second quotient/remainder refinement.
|
|
Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
|
|
if (DstDivReg)
|
|
B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
|
|
|
|
if (DstRemReg)
|
|
B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
|
|
}
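// Hedged summary of the sequence above: Z starts as a fixed-point estimate
// of 2^32 / Y derived from V_RCP_IFLAG_F32, and
//   Z = Z + umulh(Z, (-Y) * Z)
// is one Newton-Raphson style refinement of that estimate. The quotient
// guess Q = umulh(X, Z) can still be slightly too small, which is why two
// compare-against-Y / conditional add-and-subtract refinement steps follow.
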
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
|
|
//
|
|
// Return lo, hi of result
|
|
//
|
|
// %cvt.lo = G_UITOFP Val.lo
|
|
// %cvt.hi = G_UITOFP Val.hi
|
|
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
|
|
// %rcp = G_AMDGPU_RCP_IFLAG %mad
|
|
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
|
|
// %mul2 = G_FMUL %mul1, 2**(-32)
|
|
// %trunc = G_INTRINSIC_TRUNC %mul2
|
|
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
|
|
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
|
|
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
|
|
Register Val) {
|
|
const LLT S32 = LLT::scalar(32);
|
|
auto Unmerge = B.buildUnmerge(S32, Val);
|
|
|
|
auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
|
|
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
|
|
|
|
auto Mad = B.buildFMAD(
|
|
S32, CvtHi, // 2**32
|
|
B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
|
|
|
|
auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
|
|
auto Mul1 = B.buildFMul(
|
|
S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
|
|
|
|
// 2**(-32)
|
|
auto Mul2 = B.buildFMul(
|
|
S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
|
|
auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
|
|
|
|
// -(2**32)
|
|
auto Mad2 = B.buildFMAD(
|
|
S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
|
|
Mul1);
|
|
|
|
auto ResultLo = B.buildFPTOUI(S32, Mad2);
|
|
auto ResultHi = B.buildFPTOUI(S32, Trunc);
|
|
|
|
return {ResultLo.getReg(0), ResultHi.getReg(0)};
|
|
}
|
|
|
|
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
|
|
Register DstDivReg,
|
|
Register DstRemReg,
|
|
Register Numer,
|
|
Register Denom) const {
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT S64 = LLT::scalar(64);
|
|
const LLT S1 = LLT::scalar(1);
|
|
Register RcpLo, RcpHi;
|
|
|
|
std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
|
|
|
|
auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
|
|
|
|
auto Zero64 = B.buildConstant(S64, 0);
|
|
auto NegDenom = B.buildSub(S64, Zero64, Denom);
|
|
|
|
auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
|
|
auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
|
|
|
|
auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
|
|
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
|
|
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
|
|
|
|
auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
|
|
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
|
|
auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
|
|
|
|
auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
|
|
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
|
|
auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
|
|
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
|
|
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
|
|
|
|
auto Zero32 = B.buildConstant(S32, 0);
|
|
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
|
|
auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
|
|
auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
|
|
|
|
auto UnmergeNumer = B.buildUnmerge(S32, Numer);
|
|
Register NumerLo = UnmergeNumer.getReg(0);
|
|
Register NumerHi = UnmergeNumer.getReg(1);
|
|
|
|
auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
|
|
auto Mul3 = B.buildMul(S64, Denom, MulHi3);
|
|
auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
|
|
Register Mul3_Lo = UnmergeMul3.getReg(0);
|
|
Register Mul3_Hi = UnmergeMul3.getReg(1);
|
|
auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
|
|
auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
|
|
auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
|
|
auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
|
|
|
|
auto UnmergeDenom = B.buildUnmerge(S32, Denom);
|
|
Register DenomLo = UnmergeDenom.getReg(0);
|
|
Register DenomHi = UnmergeDenom.getReg(1);
|
|
|
|
auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
|
|
auto C1 = B.buildSExt(S32, CmpHi);
|
|
|
|
auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
|
|
auto C2 = B.buildSExt(S32, CmpLo);
|
|
|
|
auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
|
|
auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
|
|
|
|
// TODO: Here and below portions of the code can be enclosed into if/endif.
|
|
// Currently control flow is unconditional and we have 4 selects after
|
|
// potential endif to substitute PHIs.
|
|
|
|
// if C3 != 0 ...
|
|
auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
|
|
auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
|
|
auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
|
|
auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
|
|
|
|
auto One64 = B.buildConstant(S64, 1);
|
|
auto Add3 = B.buildAdd(S64, MulHi3, One64);
|
|
|
|
auto C4 =
|
|
B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
|
|
auto C5 =
|
|
B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
|
|
auto C6 = B.buildSelect(
|
|
S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
|
|
|
|
// if (C6 != 0)
|
|
auto Add4 = B.buildAdd(S64, Add3, One64);
|
|
auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
|
|
|
|
auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
|
|
auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
|
|
auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
|
|
|
|
// endif C6
|
|
// endif C3
|
|
|
|
if (DstDivReg) {
|
|
auto Sel1 = B.buildSelect(
|
|
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
|
|
B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
|
|
Sel1, MulHi3);
|
|
}
|
|
|
|
if (DstRemReg) {
|
|
auto Sel2 = B.buildSelect(
|
|
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
|
|
B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
|
|
Sel2, Sub1);
|
|
}
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register DstDivReg, DstRemReg;
|
|
switch (MI.getOpcode()) {
|
|
default:
|
|
llvm_unreachable("Unexpected opcode!");
|
|
case AMDGPU::G_UDIV: {
|
|
DstDivReg = MI.getOperand(0).getReg();
|
|
break;
|
|
}
|
|
case AMDGPU::G_UREM: {
|
|
DstRemReg = MI.getOperand(0).getReg();
|
|
break;
|
|
}
|
|
case AMDGPU::G_UDIVREM: {
|
|
DstDivReg = MI.getOperand(0).getReg();
|
|
DstRemReg = MI.getOperand(1).getReg();
|
|
break;
|
|
}
|
|
}
|
|
|
|
const LLT S64 = LLT::scalar(64);
|
|
const LLT S32 = LLT::scalar(32);
|
|
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
|
|
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
|
|
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
|
|
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
|
|
|
|
if (Ty == S32)
|
|
legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
|
|
else if (Ty == S64)
|
|
legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
|
|
else
|
|
return false;
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
const LLT S64 = LLT::scalar(64);
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
|
|
if (Ty != S32 && Ty != S64)
|
|
return false;
|
|
|
|
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
|
|
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
|
|
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
|
|
|
|
auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
|
|
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
|
|
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
|
|
|
|
LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
|
|
RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
|
|
|
|
LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
|
|
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
|
|
|
|
Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
|
|
switch (MI.getOpcode()) {
|
|
default:
|
|
llvm_unreachable("Unexpected opcode!");
|
|
case AMDGPU::G_SDIV: {
|
|
DstDivReg = MI.getOperand(0).getReg();
|
|
TmpDivReg = MRI.createGenericVirtualRegister(Ty);
|
|
break;
|
|
}
|
|
case AMDGPU::G_SREM: {
|
|
DstRemReg = MI.getOperand(0).getReg();
|
|
TmpRemReg = MRI.createGenericVirtualRegister(Ty);
|
|
break;
|
|
}
|
|
case AMDGPU::G_SDIVREM: {
|
|
DstDivReg = MI.getOperand(0).getReg();
|
|
DstRemReg = MI.getOperand(1).getReg();
|
|
TmpDivReg = MRI.createGenericVirtualRegister(Ty);
|
|
TmpRemReg = MRI.createGenericVirtualRegister(Ty);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (Ty == S32)
|
|
legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
|
|
else
|
|
legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
|
|
|
|
if (DstDivReg) {
|
|
auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
|
|
auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
|
|
B.buildSub(DstDivReg, SignXor, Sign);
|
|
}
|
|
|
|
if (DstRemReg) {
|
|
auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
|
|
auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
|
|
B.buildSub(DstRemReg, SignXor, Sign);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
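// Worked example of the sign handling above (illustrative): for LHS = -7 on
// s32, LHSign = -7 >> 31 = -1, and (LHS + LHSign) ^ LHSign
// = (-8) ^ 0xffffffff = 7, i.e. |LHS|. After the unsigned division, the
// quotient sign is restored as (q ^ (LHSign ^ RHSign)) - (LHSign ^ RHSign),
// while the remainder takes the sign of LHS.
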
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
uint16_t Flags = MI.getFlags();
|
|
LLT ResTy = MRI.getType(Res);
|
|
|
|
const MachineFunction &MF = B.getMF();
|
|
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
|
|
MF.getTarget().Options.UnsafeFPMath;
|
|
|
|
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
|
|
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
|
|
return false;
|
|
|
|
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
|
|
// the CI documentation has a worst case error of 1 ulp.
|
|
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
|
|
// use it as long as we aren't trying to use denormals.
|
|
//
|
|
// v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
|
|
|
|
// 1 / x -> RCP(x)
|
|
if (CLHS->isExactlyValue(1.0)) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
|
|
.addUse(RHS)
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// -1 / x -> RCP( FNEG(x) )
|
|
if (CLHS->isExactlyValue(-1.0)) {
|
|
auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
|
|
.addUse(FNeg.getReg(0))
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// For f16 require afn or arcp.
|
|
// For f32 require afn.
|
|
if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
|
|
!MI.getFlag(MachineInstr::FmArcp)))
|
|
return false;
|
|
|
|
// x / y -> x * (1.0 / y)
|
|
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
|
|
.addUse(RHS)
|
|
.setMIFlags(Flags);
|
|
B.buildFMul(Res, LHS, RCP, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register X = MI.getOperand(1).getReg();
|
|
Register Y = MI.getOperand(2).getReg();
|
|
uint16_t Flags = MI.getFlags();
|
|
LLT ResTy = MRI.getType(Res);
|
|
|
|
const MachineFunction &MF = B.getMF();
|
|
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
|
|
MI.getFlag(MachineInstr::FmAfn);
|
|
|
|
if (!AllowInaccurateRcp)
|
|
return false;
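// Refine rcp(Y) with two Newton-Raphson steps, then apply one correction
// step to the quotient:
//   r' = r + r * (1 - y * r)      (applied twice)
//   q  = x * r'
//   res = q + r' * (x - y * q)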
|
|
|
|
auto NegY = B.buildFNeg(ResTy, Y);
|
|
auto One = B.buildFConstant(ResTy, 1.0);
|
|
|
|
auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
|
|
.addUse(Y)
|
|
.setMIFlags(Flags);
|
|
|
|
auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
|
|
R = B.buildFMA(ResTy, Tmp0, R, R);
|
|
|
|
auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
|
|
R = B.buildFMA(ResTy, Tmp1, R, R);
|
|
|
|
auto Ret = B.buildFMul(ResTy, X, R);
|
|
auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
|
|
|
|
B.buildFMA(Res, Tmp2, R, Ret);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
if (legalizeFastUnsafeFDIV(MI, MRI, B))
|
|
return true;
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
LLT S16 = LLT::scalar(16);
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
|
|
// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
|
|
// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
|
|
// q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
|
|
// e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
|
|
// q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
|
|
// e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
|
|
// tmp.u = opx(V_MUL_F32, e32.u, r32.u);
|
|
// tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
|
|
// q32.u = opx(V_ADD_F32, tmp.u, q32.u);
|
|
// q16.u = opx(V_CVT_F16_F32, q32.u);
|
|
// q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
|
|
|
|
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
|
|
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
|
|
auto NegRHSExt = B.buildFNeg(S32, RHSExt);
|
|
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
|
|
.addUse(RHSExt.getReg(0))
|
|
.setMIFlags(Flags);
|
|
auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
|
|
MachineInstrBuilder Err;
|
|
if (ST.hasMadMacF32Insts()) {
|
|
Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
|
|
Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
|
|
Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
|
|
} else {
|
|
Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
|
|
Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
|
|
Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
|
|
}
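// Mask the scaled error term down to its sign and exponent bits (0xff800000)
// before adding it back into the quotient, as in the expansion sketched above.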
|
|
auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
|
|
Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
|
|
Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
|
|
auto RDst = B.buildFPTrunc(S16, Quot, Flags);
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
|
|
.addUse(RDst.getReg(0))
|
|
.addUse(RHS)
|
|
.addUse(LHS)
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static constexpr unsigned SPDenormModeBitField =
|
|
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
|
|
|
|
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
|
|
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
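// On targets with S_DENORM_MODE the 4-bit immediate packs the FP32 mode in
// bits [1:0] and the FP64/FP16 mode in bits [3:2]; otherwise the FP32 denorm
// field of the MODE register is written directly with S_SETREG.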
|
|
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
|
|
const GCNSubtarget &ST,
|
|
SIModeRegisterDefaults Mode) {
|
|
// Set SP denorm mode to this value.
|
|
unsigned SPDenormMode =
|
|
Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
|
|
|
|
if (ST.hasDenormModeInst()) {
|
|
// Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
|
|
uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
|
|
|
|
uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
|
|
B.buildInstr(AMDGPU::S_DENORM_MODE)
|
|
.addImm(NewDenormModeValue);
|
|
|
|
} else {
|
|
B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
|
|
.addImm(SPDenormMode)
|
|
.addImm(SPDenormModeBitField);
|
|
}
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
if (legalizeFastUnsafeFDIV(MI, MRI, B))
|
|
return true;
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
SIModeRegisterDefaults Mode = MFI->getMode();
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
LLT S1 = LLT::scalar(1);
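// The expansion follows the usual div_scale / div_fmas / div_fixup sequence:
// scale the operands into range, refine rcp of the scaled denominator and the
// scaled quotient with FMAs (with FP32 denorms temporarily enabled if needed),
// then div_fmas applies the final correction using the scale flag and
// div_fixup handles the special cases.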
|
|
|
|
auto One = B.buildFConstant(S32, 1.0f);
|
|
|
|
auto DenominatorScaled =
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
|
|
.addUse(LHS)
|
|
.addUse(RHS)
|
|
.addImm(0)
|
|
.setMIFlags(Flags);
|
|
auto NumeratorScaled =
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
|
|
.addUse(LHS)
|
|
.addUse(RHS)
|
|
.addImm(1)
|
|
.setMIFlags(Flags);
|
|
|
|
auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
|
|
.addUse(DenominatorScaled.getReg(0))
|
|
.setMIFlags(Flags);
|
|
auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
|
|
|
|
const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
|
|
const bool HasDynamicDenormals =
|
|
(Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
|
|
(Mode.FP32Denormals.Output == DenormalMode::Dynamic);
|
|
|
|
Register SavedSPDenormMode;
|
|
if (!PreservesDenormals) {
|
|
if (HasDynamicDenormals) {
|
|
SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
|
|
B.buildInstr(AMDGPU::S_GETREG_B32)
|
|
.addDef(SavedSPDenormMode)
|
|
.addImm(SPDenormModeBitField);
|
|
}
|
|
toggleSPDenormMode(true, B, ST, Mode);
|
|
}
|
|
|
|
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
|
|
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
|
|
auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
|
|
auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
|
|
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
|
|
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
|
|
|
|
if (!PreservesDenormals) {
|
|
if (HasDynamicDenormals) {
|
|
assert(SavedSPDenormMode);
|
|
B.buildInstr(AMDGPU::S_SETREG_B32)
|
|
.addReg(SavedSPDenormMode)
|
|
.addImm(SPDenormModeBitField);
|
|
} else
|
|
toggleSPDenormMode(false, B, ST, Mode);
|
|
}
|
|
|
|
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
|
|
.addUse(Fma4.getReg(0))
|
|
.addUse(Fma1.getReg(0))
|
|
.addUse(Fma3.getReg(0))
|
|
.addUse(NumeratorScaled.getReg(1))
|
|
.setMIFlags(Flags);
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
|
|
.addUse(Fmas.getReg(0))
|
|
.addUse(RHS)
|
|
.addUse(LHS)
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
if (legalizeFastUnsafeFDIV64(MI, MRI, B))
|
|
return true;
|
|
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register LHS = MI.getOperand(1).getReg();
|
|
Register RHS = MI.getOperand(2).getReg();
|
|
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
LLT S64 = LLT::scalar(64);
|
|
LLT S1 = LLT::scalar(1);
|
|
|
|
auto One = B.buildFConstant(S64, 1.0);
|
|
|
|
auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
|
|
.addUse(LHS)
|
|
.addUse(RHS)
|
|
.addImm(0)
|
|
.setMIFlags(Flags);
|
|
|
|
auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
|
|
|
|
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
|
|
.addUse(DivScale0.getReg(0))
|
|
.setMIFlags(Flags);
|
|
|
|
auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
|
|
auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
|
|
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
|
|
|
|
auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
|
|
.addUse(LHS)
|
|
.addUse(RHS)
|
|
.addImm(1)
|
|
.setMIFlags(Flags);
|
|
|
|
auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
|
|
auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
|
|
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
|
|
|
|
Register Scale;
|
|
if (!ST.hasUsableDivScaleConditionOutput()) {
|
|
// Workaround a hardware bug on SI where the condition output from div_scale
|
|
// is not usable.
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
auto NumUnmerge = B.buildUnmerge(S32, LHS);
|
|
auto DenUnmerge = B.buildUnmerge(S32, RHS);
|
|
auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
|
|
auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
|
|
|
|
auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
|
|
Scale1Unmerge.getReg(1));
|
|
auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
|
|
Scale0Unmerge.getReg(1));
|
|
Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
|
|
} else {
|
|
Scale = DivScale1.getReg(1);
|
|
}
|
|
|
|
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
|
|
.addUse(Fma4.getReg(0))
|
|
.addUse(Fma3.getReg(0))
|
|
.addUse(Mul.getReg(0))
|
|
.addUse(Scale)
|
|
.setMIFlags(Flags);
|
|
|
|
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
|
|
.addUse(Fmas.getReg(0))
|
|
.addUse(RHS)
|
|
.addUse(LHS)
|
|
.setMIFlags(Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Res0 = MI.getOperand(0).getReg();
|
|
Register Res1 = MI.getOperand(1).getReg();
|
|
Register Val = MI.getOperand(2).getReg();
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
LLT Ty = MRI.getType(Res0);
|
|
LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
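// frexp lowers to the frexp_mant / frexp_exp intrinsics. On subtargets with
// the fract bug those do not give the expected result for +/-inf and nan
// inputs, so select (Val, 0) for non-finite inputs below.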
|
|
|
|
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
|
|
.addUse(Val)
|
|
.setMIFlags(Flags);
|
|
auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
|
|
.addUse(Val)
|
|
.setMIFlags(Flags);
|
|
|
|
if (ST.hasFractBug()) {
|
|
auto Fabs = B.buildFAbs(Ty, Val);
|
|
auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
|
|
auto IsFinite =
|
|
B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
|
|
auto Zero = B.buildConstant(InstrExpTy, 0);
|
|
Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
|
|
Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
|
|
}
|
|
|
|
B.buildCopy(Res0, Mant);
|
|
B.buildSExtOrTrunc(Res1, Exp);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Register Res = MI.getOperand(0).getReg();
|
|
Register LHS = MI.getOperand(2).getReg();
|
|
Register RHS = MI.getOperand(3).getReg();
|
|
uint16_t Flags = MI.getFlags();
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
LLT S1 = LLT::scalar(1);
|
|
|
|
auto Abs = B.buildFAbs(S32, RHS, Flags);
|
|
const APFloat C0Val(1.0f);
|
|
|
|
auto C0 = B.buildFConstant(S32, 0x1p+96f);
|
|
auto C1 = B.buildFConstant(S32, 0x1p-32f);
|
|
auto C2 = B.buildFConstant(S32, 1.0f);
|
|
|
|
auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
|
|
auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
|
|
|
|
auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
|
|
|
|
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
|
|
.addUse(Mul0.getReg(0))
|
|
.setMIFlags(Flags);
|
|
|
|
auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
|
|
|
|
B.buildFMul(Res, Sel, Mul1, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
// Bypass the correct expansion that a standard promotion through G_FSQRT
// would get. The f32 op is accurate enough for the f16 case.
|
|
unsigned Flags = MI.getFlags();
|
|
assert(!ST.has16BitInsts());
|
|
const LLT F32 = LLT::scalar(32);
|
|
auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
|
|
auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
.addUse(Ext.getReg(0))
.setMIFlags(Flags);
B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
MachineFunction &MF = B.getMF();
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register X = MI.getOperand(1).getReg();
|
|
const unsigned Flags = MI.getFlags();
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT F32 = LLT::scalar(32);
|
|
const LLT I32 = LLT::scalar(32);
|
|
|
|
if (allowApproxFunc(MF, Flags)) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
|
|
.addUse(X)
|
|
.setMIFlags(Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
|
|
auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
|
|
auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
|
|
auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
|
|
auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
|
|
|
|
Register SqrtS = MRI.createGenericVirtualRegister(F32);
|
|
if (needsDenormHandlingF32(MF, X, Flags)) {
|
|
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
|
|
.addUse(SqrtX.getReg(0))
|
|
.setMIFlags(Flags);
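// With denorm handling the amdgcn.sqrt result can be off by 1 ulp, so form
// the neighbouring values (integer +/-1 on the bit pattern) and use the signs
// of the residuals computed below to step the result down or up as needed.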
|
|
|
|
auto NegOne = B.buildConstant(I32, -1);
|
|
auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
|
|
|
|
auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
|
|
auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
|
|
|
|
auto PosOne = B.buildConstant(I32, 1);
|
|
auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
|
|
|
|
auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
|
|
auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
|
|
|
|
auto Zero = B.buildFConstant(F32, 0.0f);
|
|
auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
|
|
|
|
SqrtS =
|
|
B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
|
|
|
|
auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
|
|
SqrtS =
|
|
B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
|
|
} else {
|
|
auto SqrtR =
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
|
|
B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
|
|
|
|
auto Half = B.buildFConstant(F32, 0.5f);
|
|
auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
|
|
auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
|
|
auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
|
|
SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
|
|
SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
|
|
auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
|
|
auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
|
|
SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
|
|
}
|
|
|
|
auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
|
|
|
|
auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
|
|
|
|
SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
|
|
|
|
auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
|
|
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
// For double type, the SQRT and RSQ instructions don't have the required
// precision, so we apply Goldschmidt's algorithm to improve the result:
|
|
//
|
|
// y0 = rsq(x)
|
|
// g0 = x * y0
|
|
// h0 = 0.5 * y0
|
|
//
|
|
// r0 = 0.5 - h0 * g0
|
|
// g1 = g0 * r0 + g0
|
|
// h1 = h0 * r0 + h0
|
|
//
|
|
// r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
|
|
// g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
|
|
// h2 = h1 * r1 + h1
|
|
//
|
|
// r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
|
|
// g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
|
|
//
|
|
// sqrt(x) = g3
|
|
|
|
const LLT S1 = LLT::scalar(1);
|
|
const LLT S32 = LLT::scalar(32);
|
|
const LLT F64 = LLT::scalar(64);
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
|
|
|
|
Register X = MI.getOperand(1).getReg();
|
|
unsigned Flags = MI.getFlags();
|
|
|
|
auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
|
|
|
|
auto ZeroInt = B.buildConstant(S32, 0);
|
|
auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
|
|
|
|
// Scale up input if it is too small.
|
|
auto ScaleUpFactor = B.buildConstant(S32, 256);
|
|
auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
|
|
auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
|
|
|
|
auto SqrtY =
|
|
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
|
|
|
|
auto Half = B.buildFConstant(F64, 0.5);
|
|
auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
|
|
auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
|
|
|
|
auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
|
|
auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
|
|
|
|
auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
|
|
auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
|
|
|
|
auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
|
|
auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
|
|
|
|
auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
|
|
|
|
auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
|
|
auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
|
|
|
|
auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
|
|
|
|
// Scale down the result.
|
|
auto ScaleDownFactor = B.buildConstant(S32, -128);
|
|
auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
|
|
SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
|
|
|
|
// TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
|
|
// with finite only or nsz because rsq(+/-0) = +/-inf
|
|
|
|
// TODO: Check for DAZ and expand to subnormals
|
|
auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
|
|
|
|
// If x is +INF, +0, or -0, use its original value
|
|
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
|
|
if (Ty == LLT::scalar(32))
|
|
return legalizeFSQRTF32(MI, MRI, B);
|
|
if (Ty == LLT::scalar(64))
|
|
return legalizeFSQRTF64(MI, MRI, B);
|
|
if (Ty == LLT::scalar(16))
|
|
return legalizeFSQRTF16(MI, MRI, B);
|
|
return false;
|
|
}
|
|
|
|
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
|
|
// FIXME: Why do we handle this one but not other removed instructions?
|
|
//
|
|
// Reciprocal square root. The clamp prevents infinite results, clamping
|
|
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
|
|
// +-max_float.
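// Expansion sketch:
//   %rsq = llvm.amdgcn.rsq(%src)
//   %min = fminnum(%rsq, +max_float)   ; fminnum_ieee when IEEE mode is on
//   %res = fmaxnum(%min, -max_float)   ; fmaxnum_ieee when IEEE mode is on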
|
|
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
|
|
return true;
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
Register Src = MI.getOperand(2).getReg();
|
|
auto Flags = MI.getFlags();
|
|
|
|
LLT Ty = MRI.getType(Dst);
|
|
|
|
const fltSemantics *FltSemantics;
|
|
if (Ty == LLT::scalar(32))
|
|
FltSemantics = &APFloat::IEEEsingle();
|
|
else if (Ty == LLT::scalar(64))
|
|
FltSemantics = &APFloat::IEEEdouble();
|
|
else
|
|
return false;
|
|
|
|
auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
|
|
.addUse(Src)
|
|
.setMIFlags(Flags);
|
|
|
|
// We don't need to concern ourselves with the snan handling difference between
// the IEEE and non-IEEE min/max forms, since the rsq has already quieted (or
// not) any snan; just use the form that will directly select.
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
const bool UseIEEE = MFI->getMode().IEEE;
|
|
|
|
auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
|
|
auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
|
|
B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
|
|
|
|
auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
|
|
|
|
if (UseIEEE)
|
|
B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
|
|
else
|
|
B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// TODO: Fix pointer type handling
|
|
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
|
|
MachineInstr &MI,
|
|
Intrinsic::ID IID) const {
|
|
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
|
|
IID == Intrinsic::amdgcn_permlanex16;
|
|
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
|
|
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
|
|
|
|
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
|
|
Register Src2, LLT VT) -> Register {
|
|
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
|
|
switch (IID) {
|
|
case Intrinsic::amdgcn_readfirstlane:
|
|
case Intrinsic::amdgcn_permlane64:
|
|
return LaneOp.getReg(0);
|
|
case Intrinsic::amdgcn_readlane:
|
|
case Intrinsic::amdgcn_set_inactive:
|
|
case Intrinsic::amdgcn_set_inactive_chain_arg:
|
|
return LaneOp.addUse(Src1).getReg(0);
|
|
case Intrinsic::amdgcn_writelane:
|
|
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
|
|
case Intrinsic::amdgcn_permlane16:
|
|
case Intrinsic::amdgcn_permlanex16: {
|
|
Register Src3 = MI.getOperand(5).getReg();
|
|
int64_t Src4 = MI.getOperand(6).getImm();
int64_t Src5 = MI.getOperand(7).getImm();
|
|
return LaneOp.addUse(Src1)
|
|
.addUse(Src2)
|
|
.addUse(Src3)
|
|
.addImm(Src4)
|
|
.addImm(Src5)
|
|
.getReg(0);
|
|
}
|
|
case Intrinsic::amdgcn_mov_dpp8:
|
|
return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
|
|
case Intrinsic::amdgcn_update_dpp:
|
|
return LaneOp.addUse(Src1)
|
|
.addImm(MI.getOperand(4).getImm())
|
|
.addImm(MI.getOperand(5).getImm())
|
|
.addImm(MI.getOperand(6).getImm())
|
|
.addImm(MI.getOperand(7).getImm())
|
|
.getReg(0);
|
|
default:
|
|
llvm_unreachable("unhandled lane op");
|
|
}
|
|
};
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
Register Src0 = MI.getOperand(2).getReg();
|
|
Register Src1, Src2;
|
|
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
|
|
IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
|
|
Src1 = MI.getOperand(3).getReg();
|
|
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
|
|
Src2 = MI.getOperand(4).getReg();
|
|
}
|
|
}
|
|
|
|
LLT Ty = MRI.getType(DstReg);
|
|
unsigned Size = Ty.getSizeInBits();
|
|
|
|
unsigned SplitSize = 32;
|
|
if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
|
|
ST.hasDPALU_DPP() &&
|
|
AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
|
|
SplitSize = 64;
|
|
|
|
if (Size == SplitSize) {
|
|
// Already legal
|
|
return true;
|
|
}
|
|
|
|
if (Size < 32) {
|
|
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
|
|
|
|
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
|
|
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
|
|
|
|
if (IID == Intrinsic::amdgcn_writelane)
|
|
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
|
|
|
|
Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
|
|
B.buildTrunc(DstReg, LaneOpDst);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
if (Size % SplitSize != 0)
|
|
return false;
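// Wide results are handled by unmerging the sources into SplitSize-wide
// pieces, emitting one lane op per piece, and re-merging the partial results;
// e.g. a 64-bit readlane becomes two 32-bit readlanes.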
|
|
|
|
LLT PartialResTy = LLT::scalar(SplitSize);
|
|
if (Ty.isVector()) {
|
|
LLT EltTy = Ty.getElementType();
|
|
unsigned EltSize = EltTy.getSizeInBits();
|
|
if (EltSize == SplitSize) {
|
|
PartialResTy = EltTy;
|
|
} else if (EltSize == 16 || EltSize == 32) {
|
|
unsigned NElem = SplitSize / EltSize;
|
|
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
|
|
}
|
|
// Handle all other cases via S32/S64 pieces.
|
|
}
|
|
|
|
SmallVector<Register, 4> PartialRes;
|
|
unsigned NumParts = Size / SplitSize;
|
|
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
|
|
MachineInstrBuilder Src1Parts, Src2Parts;
|
|
|
|
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
|
|
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
|
|
|
|
if (IID == Intrinsic::amdgcn_writelane)
|
|
Src2Parts = B.buildUnmerge(PartialResTy, Src2);
|
|
|
|
for (unsigned i = 0; i < NumParts; ++i) {
|
|
Src0 = Src0Parts.getReg(i);
|
|
|
|
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
|
|
Src1 = Src1Parts.getReg(i);
|
|
|
|
if (IID == Intrinsic::amdgcn_writelane)
|
|
Src2 = Src2Parts.getReg(i);
|
|
|
|
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
|
|
}
|
|
|
|
B.buildMergeLikeInstr(DstReg, PartialRes);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
uint64_t Offset =
|
|
ST.getTargetLowering()->getImplicitParameterOffset(
|
|
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
|
|
LLT DstTy = MRI.getType(DstReg);
|
|
LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
|
|
|
|
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
|
|
if (!loadInputValue(KernargPtrReg, B,
|
|
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
|
|
return false;
|
|
|
|
// FIXME: This should be nuw
|
|
B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
|
|
return true;
|
|
}
|
|
|
|
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
|
|
/// bits of the pointer and replace them with the stride argument, then
|
|
/// merge_values everything together. In the common case of a raw buffer (the
|
|
/// stride component is 0), we can just AND off the upper half.
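/// For example (hypothetical values), with Pointer = 0xAAAABBBBCCCCDDDD and
/// Stride = 0x10 the first two descriptor dwords become:
///   word0 = 0xCCCCDDDD                                 (low half of Pointer)
///   word1 = (0xAAAABBBB & 0xffff) | (0x10 << 16) = 0x0010BBBB
/// followed by NumRecords and Flags.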
|
|
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
|
|
Register Result = MI.getOperand(0).getReg();
|
|
Register Pointer = MI.getOperand(2).getReg();
|
|
Register Stride = MI.getOperand(3).getReg();
|
|
Register NumRecords = MI.getOperand(4).getReg();
|
|
Register Flags = MI.getOperand(5).getReg();
|
|
|
|
LLT S32 = LLT::scalar(32);
|
|
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
|
auto Unmerge = B.buildUnmerge(S32, Pointer);
|
|
Register LowHalf = Unmerge.getReg(0);
|
|
Register HighHalf = Unmerge.getReg(1);
|
|
|
|
auto AndMask = B.buildConstant(S32, 0x0000ffff);
|
|
auto Masked = B.buildAnd(S32, HighHalf, AndMask);
|
|
|
|
MachineInstrBuilder NewHighHalf = Masked;
|
|
std::optional<ValueAndVReg> StrideConst =
|
|
getIConstantVRegValWithLookThrough(Stride, MRI);
|
|
if (!StrideConst || !StrideConst->Value.isZero()) {
|
|
MachineInstrBuilder ShiftedStride;
|
|
if (StrideConst) {
|
|
uint32_t StrideVal = StrideConst->Value.getZExtValue();
|
|
uint32_t ShiftedStrideVal = StrideVal << 16;
|
|
ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
|
|
} else {
|
|
auto ExtStride = B.buildAnyExt(S32, Stride);
|
|
auto ShiftConst = B.buildConstant(S32, 16);
|
|
ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
|
|
}
|
|
NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
|
|
}
|
|
Register NewHighHalfReg = NewHighHalf.getReg(0);
|
|
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
if (!MFI->isEntryFunction()) {
|
|
return legalizePreloadedArgIntrin(MI, MRI, B,
|
|
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
|
|
}
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
if (!getImplicitArgPtr(DstReg, MRI, B))
|
|
return false;
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
Function &F = B.getMF().getFunction();
|
|
std::optional<uint32_t> KnownSize =
|
|
AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
|
|
if (KnownSize.has_value())
|
|
B.buildConstant(DstReg, *KnownSize);
|
|
return false;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const {
|
|
|
|
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
|
|
if (!MFI->isEntryFunction()) {
|
|
return legalizePreloadedArgIntrin(MI, MRI, B,
|
|
AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
|
|
}
|
|
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
if (!getLDSKernelId(DstReg, MRI, B))
|
|
return false;
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
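// A flat pointer is in a given segment (e.g. local or private) iff the high
// 32 bits of its address equal that segment's aperture base, so compare the
// unmerged high half against the aperture register.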
|
|
|
|
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B,
|
|
unsigned AddrSpace) const {
|
|
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
|
|
auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
|
|
Register Hi32 = Unmerge.getReg(1);
|
|
|
|
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
|
|
// offset (the offset that is included in bounds checking and swizzling, to be
|
|
// split between the instruction's voffset and immoffset fields) and soffset
|
|
// (the offset that is excluded from bounds checking and swizzling, to go in
|
|
// the instruction's soffset field). This function takes the first kind of
|
|
// offset and figures out how to split it between voffset and immoffset.
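// For example, assuming a maximum immediate offset of 4095, a constant offset
// of 4100 is split into ImmOffset = 4 with 4096 added to the voffset base.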
|
|
std::pair<Register, unsigned>
|
|
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
|
|
Register OrigOffset) const {
|
|
const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
|
|
Register BaseReg;
|
|
unsigned ImmOffset;
|
|
const LLT S32 = LLT::scalar(32);
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
std::tie(BaseReg, ImmOffset) =
|
|
AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
|
|
|
|
// If BaseReg is a pointer, convert it to int.
|
|
if (MRI.getType(BaseReg).isPointer())
|
|
BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
|
|
|
|
// If the immediate value is too big for the immoffset field, put only bits
|
|
// that would normally fit in the immoffset field. The remaining value that
|
|
// is copied/added for the voffset field is a large power of 2, and it
|
|
// stands more chance of being CSEd with the copy/add for another similar
|
|
// load/store.
|
|
// However, do not do that rounding down if that is a negative
|
|
// number, as it appears to be illegal to have a negative offset in the
|
|
// vgpr, even if adding the immediate offset makes it positive.
|
|
unsigned Overflow = ImmOffset & ~MaxImm;
|
|
ImmOffset -= Overflow;
|
|
if ((int32_t)Overflow < 0) {
|
|
Overflow += ImmOffset;
|
|
ImmOffset = 0;
|
|
}
|
|
|
|
if (Overflow != 0) {
|
|
if (!BaseReg) {
|
|
BaseReg = B.buildConstant(S32, Overflow).getReg(0);
|
|
} else {
|
|
auto OverflowVal = B.buildConstant(S32, Overflow);
|
|
BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
|
|
}
|
|
}
|
|
|
|
if (!BaseReg)
|
|
BaseReg = B.buildConstant(S32, 0).getReg(0);
|
|
|
|
return std::pair(BaseReg, ImmOffset);
|
|
}
|
|
|
|
/// Handle register layout difference for f16 images for some subtargets.
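/// e.g. on subtargets with unpacked D16 VMEM, a <4 x s16> store value is
/// widened to <4 x s32> with each element any-extended into a full dword.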
|
|
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
|
|
MachineRegisterInfo &MRI,
|
|
Register Reg,
|
|
bool ImageStore) const {
|
|
const LLT S16 = LLT::scalar(16);
|
|
const LLT S32 = LLT::scalar(32);
|
|
LLT StoreVT = MRI.getType(Reg);
|
|
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
|
|
|
|
if (ST.hasUnpackedD16VMem()) {
|
|
auto Unmerge = B.buildUnmerge(S16, Reg);
|
|
|
|
SmallVector<Register, 4> WideRegs;
|
|
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
|
|
WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
|
|
|
|
int NumElts = StoreVT.getNumElements();
|
|
|
|
return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
|
|
.getReg(0);
|
|
}
|
|
|
|
if (ImageStore && ST.hasImageStoreD16Bug()) {
|
|
if (StoreVT.getNumElements() == 2) {
|
|
SmallVector<Register, 4> PackedRegs;
|
|
Reg = B.buildBitcast(S32, Reg).getReg(0);
|
|
PackedRegs.push_back(Reg);
|
|
PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
|
|
return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
|
|
.getReg(0);
|
|
}
|
|
|
|
if (StoreVT.getNumElements() == 3) {
|
|
SmallVector<Register, 4> PackedRegs;
|
|
auto Unmerge = B.buildUnmerge(S16, Reg);
|
|
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
|
|
PackedRegs.push_back(Unmerge.getReg(I));
|
|
PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
|
|
Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
|
|
return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
|
|
}
|
|
|
|
if (StoreVT.getNumElements() == 4) {
|
|
SmallVector<Register, 4> PackedRegs;
|
|
Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
|
|
auto Unmerge = B.buildUnmerge(S32, Reg);
|
|
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
|
|
PackedRegs.push_back(Unmerge.getReg(I));
|
|
PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
|
|
return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
|
|
.getReg(0);
|
|
}
|
|
|
|
llvm_unreachable("invalid data type");
|
|
}
|
|
|
|
if (StoreVT == LLT::fixed_vector(3, S16)) {
|
|
Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
|
|
.getReg(0);
|
|
}
|
|
return Reg;
|
|
}
|
|
|
|
Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
|
|
Register VData, LLT MemTy,
|
|
bool IsFormat) const {
|
|
MachineRegisterInfo *MRI = B.getMRI();
|
|
LLT Ty = MRI->getType(VData);
|
|
|
|
const LLT S16 = LLT::scalar(16);
|
|
|
|
// Fixup buffer resources themselves needing to be v4i32.
|
|
if (hasBufferRsrcWorkaround(Ty))
|
|
return castBufferRsrcToV4I32(VData, B);
|
|
|
|
if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
|
|
Ty = getBitcastRegisterType(Ty);
|
|
VData = B.buildBitcast(Ty, VData).getReg(0);
|
|
}
|
|
// Fixup illegal register types for i8 and i16 stores.
|
|
if (Ty == LLT::scalar(8) || Ty == S16) {
|
|
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
|
|
return AnyExt;
|
|
}
|
|
|
|
if (Ty.isVector()) {
|
|
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
|
|
if (IsFormat)
|
|
return handleD16VData(B, *MRI, VData);
|
|
}
|
|
}
|
|
|
|
return VData;
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
|
|
LegalizerHelper &Helper,
|
|
bool IsTyped,
|
|
bool IsFormat) const {
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
|
|
Register VData = MI.getOperand(1).getReg();
|
|
LLT Ty = MRI.getType(VData);
|
|
LLT EltTy = Ty.getScalarType();
|
|
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
const int MemSize = MMO->getSize().getValue();
|
|
LLT MemTy = MMO->getMemoryType();
|
|
|
|
VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
|
|
|
|
castBufferRsrcArgToV4I32(MI, B, 2);
|
|
Register RSrc = MI.getOperand(2).getReg();
|
|
|
|
unsigned ImmOffset;
|
|
|
|
// The typed intrinsics add an immediate after the registers.
|
|
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
|
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
|
|
Register VIndex;
|
|
int OpOffset = 0;
|
|
if (HasVIndex) {
|
|
VIndex = MI.getOperand(3).getReg();
|
|
OpOffset = 1;
|
|
} else {
|
|
VIndex = B.buildConstant(S32, 0).getReg(0);
|
|
}
|
|
|
|
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
|
|
Register SOffset = MI.getOperand(4 + OpOffset).getReg();
|
|
|
|
unsigned Format = 0;
|
|
if (IsTyped) {
|
|
Format = MI.getOperand(5 + OpOffset).getImm();
|
|
++OpOffset;
|
|
}
|
|
|
|
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
|
|
|
|
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
unsigned Opc;
|
|
if (IsTyped) {
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
|
|
AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
|
|
} else if (IsFormat) {
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
|
|
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
|
|
} else {
|
|
switch (MemSize) {
|
|
case 1:
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
|
|
break;
|
|
case 2:
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
|
|
break;
|
|
default:
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
auto MIB = B.buildInstr(Opc)
|
|
.addUse(VData) // vdata
|
|
.addUse(RSrc) // rsrc
|
|
.addUse(VIndex) // vindex
|
|
.addUse(VOffset) // voffset
|
|
.addUse(SOffset) // soffset
|
|
.addImm(ImmOffset); // offset(imm)
|
|
|
|
if (IsTyped)
|
|
MIB.addImm(Format);
|
|
|
|
MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
.addMemOperand(MMO);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
|
|
Register VIndex, Register VOffset, Register SOffset,
|
|
unsigned ImmOffset, unsigned Format,
|
|
unsigned AuxiliaryData, MachineMemOperand *MMO,
|
|
bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
|
|
auto MIB = B.buildInstr(Opc)
|
|
.addDef(LoadDstReg) // vdata
|
|
.addUse(RSrc) // rsrc
|
|
.addUse(VIndex) // vindex
|
|
.addUse(VOffset) // voffset
|
|
.addUse(SOffset) // soffset
|
|
.addImm(ImmOffset); // offset(imm)
|
|
|
|
if (IsTyped)
|
|
MIB.addImm(Format);
|
|
|
|
MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
.addMemOperand(MMO);
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
|
|
LegalizerHelper &Helper,
|
|
bool IsFormat,
|
|
bool IsTyped) const {
|
|
MachineIRBuilder &B = Helper.MIRBuilder;
|
|
MachineRegisterInfo &MRI = *B.getMRI();
|
|
GISelChangeObserver &Observer = Helper.Observer;
|
|
|
|
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
const LLT MemTy = MMO->getMemoryType();
|
|
const LLT S32 = LLT::scalar(32);
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
|
|
Register StatusDst;
|
|
int OpOffset = 0;
|
|
assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
|
|
bool IsTFE = MI.getNumExplicitDefs() == 2;
|
|
if (IsTFE) {
|
|
StatusDst = MI.getOperand(1).getReg();
|
|
++OpOffset;
|
|
}
|
|
|
|
castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
|
|
Register RSrc = MI.getOperand(2 + OpOffset).getReg();
|
|
|
|
// The typed intrinsics add an immediate after the registers.
|
|
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
|
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
|
|
Register VIndex;
|
|
if (HasVIndex) {
|
|
VIndex = MI.getOperand(3 + OpOffset).getReg();
|
|
++OpOffset;
|
|
} else {
|
|
VIndex = B.buildConstant(S32, 0).getReg(0);
|
|
}
|
|
|
|
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
|
|
Register SOffset = MI.getOperand(4 + OpOffset).getReg();
|
|
|
|
unsigned Format = 0;
|
|
if (IsTyped) {
|
|
Format = MI.getOperand(5 + OpOffset).getImm();
|
|
++OpOffset;
|
|
}
|
|
|
|
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
|
|
unsigned ImmOffset;
|
|
|
|
LLT Ty = MRI.getType(Dst);
|
|
// Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
// logic doesn't have to handle that case.
|
|
if (hasBufferRsrcWorkaround(Ty)) {
|
|
Observer.changingInstr(MI);
|
|
Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
|
|
Observer.changedInstr(MI);
|
|
Dst = MI.getOperand(0).getReg();
|
|
B.setInsertPt(B.getMBB(), MI);
|
|
}
|
|
if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
|
|
Ty = getBitcastRegisterType(Ty);
|
|
Observer.changingInstr(MI);
|
|
Helper.bitcastDst(MI, Ty, 0);
|
|
Observer.changedInstr(MI);
|
|
Dst = MI.getOperand(0).getReg();
|
|
B.setInsertPt(B.getMBB(), MI);
|
|
}
|
|
|
|
LLT EltTy = Ty.getScalarType();
|
|
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
|
|
const bool Unpacked = ST.hasUnpackedD16VMem();
|
|
|
|
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
unsigned Opc;
|
|
|
|
// TODO: Support TFE for typed and narrow loads.
|
|
if (IsTyped) {
|
|
if (IsTFE)
|
|
return false;
|
|
Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
|
|
AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
|
|
} else if (IsFormat) {
|
|
if (IsD16) {
|
|
if (IsTFE)
|
|
return false;
|
|
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
|
|
} else {
|
|
Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
|
|
: AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
|
|
}
|
|
} else {
|
|
switch (MemTy.getSizeInBits()) {
|
|
case 8:
|
|
Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
|
|
: AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
|
|
break;
|
|
case 16:
|
|
Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
|
|
: AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
|
|
break;
|
|
default:
|
|
Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
|
|
: AMDGPU::G_AMDGPU_BUFFER_LOAD;
|
|
break;
|
|
}
|
|
}
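// With TFE the hardware returns an extra status dword after the data, so load
// into an <N+1 x s32> temporary and unmerge the value and the status below.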
|
|
|
|
if (IsTFE) {
|
|
unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
|
|
unsigned NumLoadDWords = NumValueDWords + 1;
|
|
LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
|
|
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
|
|
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
|
|
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
|
|
if (MemTy.getSizeInBits() < 32) {
|
|
Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
|
|
B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
|
|
B.buildTrunc(Dst, ExtDst);
|
|
} else if (NumValueDWords == 1) {
|
|
B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
|
|
} else {
|
|
SmallVector<Register, 5> LoadElts;
|
|
for (unsigned I = 0; I != NumValueDWords; ++I)
|
|
LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
|
|
LoadElts.push_back(StatusDst);
|
|
B.buildUnmerge(LoadElts, LoadDstReg);
|
|
LoadElts.truncate(NumValueDWords);
|
|
B.buildMergeLikeInstr(Dst, LoadElts);
|
|
}
|
|
} else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
|
|
(IsD16 && !Ty.isVector())) {
|
|
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
|
|
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
|
|
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
|
B.buildTrunc(Dst, LoadDstReg);
|
|
} else if (Unpacked && IsD16 && Ty.isVector()) {
|
|
LLT UnpackedTy = Ty.changeElementSize(32);
|
|
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
|
|
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
|
|
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
|
|
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
|
|
// FIXME: G_TRUNC should work, but legalization currently fails
|
|
auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
|
|
SmallVector<Register, 4> Repack;
|
|
for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
|
|
Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
|
|
B.buildMergeLikeInstr(Dst, Repack);
|
|
} else {
|
|
buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
|
|
AuxiliaryData, MMO, IsTyped, HasVIndex, B);
|
|
}
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
|
|
switch (IntrID) {
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_add:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_add:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_and:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_and:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_or:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_or:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
|
|
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
|
|
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
|
|
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
|
|
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
|
|
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
|
|
default:
|
|
llvm_unreachable("unhandled atomic opcode");
|
|
}
|
|
}
|
|
|
|
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
|
|
MachineIRBuilder &B,
|
|
Intrinsic::ID IID) const {
|
|
const bool IsCmpSwap =
|
|
IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
|
|
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
|
|
IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
|
|
IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
|
|
|
|
Register Dst = MI.getOperand(0).getReg();
|
|
// Since we don't have 128-bit atomics, we don't need to handle the case of
// p8 arguments to the atomic itself.
|
|
Register VData = MI.getOperand(2).getReg();
|
|
|
|
Register CmpVal;
|
|
int OpOffset = 0;
|
|
|
|
if (IsCmpSwap) {
|
|
CmpVal = MI.getOperand(3).getReg();
|
|
++OpOffset;
|
|
}
|
|
|
|
castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
|
|
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
|
|
const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
|
|
|
|
// The struct intrinsic variants add one additional operand over raw.
|
|
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
|
|
Register VIndex;
|
|
if (HasVIndex) {
|
|
VIndex = MI.getOperand(4 + OpOffset).getReg();
|
|
++OpOffset;
|
|
} else {
|
|
VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
|
|
}
|
|
|
|
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
|
|
Register SOffset = MI.getOperand(5 + OpOffset).getReg();
|
|
unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
|
|
|
|
MachineMemOperand *MMO = *MI.memoperands_begin();
|
|
|
|
unsigned ImmOffset;
|
|
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
|
|
|
|
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
|
|
.addDef(Dst)
|
|
.addUse(VData); // vdata
|
|
|
|
if (IsCmpSwap)
|
|
MIB.addReg(CmpVal);
|
|
|
|
MIB.addUse(RSrc) // rsrc
|
|
.addUse(VIndex) // vindex
|
|
.addUse(VOffset) // voffset
|
|
.addUse(SOffset) // soffset
|
|
.addImm(ImmOffset) // offset(imm)
|
|
.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
|
|
.addImm(HasVIndex ? -1 : 0) // idxen(imm)
|
|
.addMemOperand(MMO);
|
|
|
|
MI.eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
|
|
/// vector with s16 typed elements.
|
|
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
|
|
SmallVectorImpl<Register> &PackedAddrs,
|
|
unsigned ArgOffset,
|
|
const AMDGPU::ImageDimIntrinsicInfo *Intr,
|
|
bool IsA16, bool IsG16) {
|
|
const LLT S16 = LLT::scalar(16);
|
|
const LLT V2S16 = LLT::fixed_vector(2, 16);
|
|
auto EndIdx = Intr->VAddrEnd;
|
|
|
|
for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
|
|
MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
|
|
if (!SrcOp.isReg())
|
|
continue; // _L to _LZ may have eliminated this.
|
|
|
|
Register AddrReg = SrcOp.getReg();
|
|
|
|
if ((I < Intr->GradientStart) ||
|
|
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
|
|
(I >= Intr->CoordStart && !IsA16)) {
|
|
if ((I < Intr->GradientStart) && IsA16 &&
|
|
(B.getMRI()->getType(AddrReg) == S16)) {
|
|
assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
|
|
// Special handling of bias when A16 is on. Bias is of type half but
// occupies a full 32-bit slot.
|
|
PackedAddrs.push_back(
|
|
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
|
|
.getReg(0));
|
|
} else {
|
|
assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
|
|
"Bias needs to be converted to 16 bit in A16 mode");
|
|
// Handle any gradient or coordinate operands that should not be packed
|
|
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
|
|
PackedAddrs.push_back(AddrReg);
|
|
}
|
|
} else {
|
|
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
|
|
// derivatives dx/dh and dx/dv are packed with undef.
|
|
if (((I + 1) >= EndIdx) ||
|
|
((Intr->NumGradients / 2) % 2 == 1 &&
|
|
(I == static_cast<unsigned>(Intr->GradientStart +
|
|
(Intr->NumGradients / 2) - 1) ||
|
|
I == static_cast<unsigned>(Intr->GradientStart +
|
|
Intr->NumGradients - 1))) ||
|
|
// Check for _L to _LZ optimization
|
|
!MI.getOperand(ArgOffset + I + 1).isReg()) {
|
|
PackedAddrs.push_back(
|
|
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
|
|
.getReg(0));
|
|
} else {
|
|
PackedAddrs.push_back(
|
|
B.buildBuildVector(
|
|
V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
|
|
.getReg(0));
|
|
++I;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Convert from separate vaddr components to a single vector address register,
|
|
/// and replace the remaining operands with $noreg.
|
|
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
|
|
int DimIdx, int NumVAddrs) {
|
|
const LLT S32 = LLT::scalar(32);
|
|
(void)S32;
|
|
SmallVector<Register, 8> AddrRegs;
|
|
for (int I = 0; I != NumVAddrs; ++I) {
|
|
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
|
|
if (SrcOp.isReg()) {
|
|
AddrRegs.push_back(SrcOp.getReg());
|
|
assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
|
|
}
|
|
}
|
|
|
|
int NumAddrRegs = AddrRegs.size();
|
|
if (NumAddrRegs != 1) {
|
|
auto VAddr =
|
|
B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
|
|
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
|
|
}
|
|
|
|
for (int I = 1; I != NumVAddrs; ++I) {
|
|
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
|
|
if (SrcOp.isReg())
|
|
MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
|
|
}
|
|
}
|
|
|
|
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
|
|
///
|
|
/// Depending on the subtarget, load/store with 16-bit element data need to be
|
|
/// rewritten to use the low half of 32-bit registers, or directly use a packed
|
|
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
|
|
/// registers.
|
|
///
|
|
/// We don't want to directly select image instructions just yet, but also want
|
|
/// to expose all register repacking to the legalizer/combiners. We also don't
|
|
/// want a selected instruction entering RegBankSelect. In order to avoid
|
|
/// defining a multitude of intermediate image instructions, directly hack on
|
|
/// the intrinsic's arguments. In cases like a16 addresses, this requires
|
|
/// padding now unnecessary arguments with $noreg.
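/// For example, with A16 enabled the s16 (u, v) coordinates of a 2D sample are
/// packed into a single <2 x s16> vaddr component, and operands made redundant
/// by the packing are replaced with $noreg.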
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData;
  LLT Ty;

  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);
  }

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16-bit addresses and pack them if so.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (BaseOpcode->NoReturn)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control, so
    // both gradients and addresses must be 16-bit.
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 is not supported on this subtarget.
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because
    // the instructions expect VGPR_32.
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    // See also below in the non-a16 branch.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into the last VAddr
      // register.
      LLT PackedAddrTy =
          LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      PackedRegs.resize(1);
    }

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
    // set of the remaining addresses.
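    //
    // As a rough illustration (not actual assembly from a test), with three
    // address dwords the two encodings differ only in register placement:
    //   NSA:      image_sample ..., [v2, v7, v11], ...  ; any three VGPRs
    //   non-NSA:  image_sample ..., v[4:6], ...         ; one contiguous tuple
    // For the non-NSA (and partial-NSA tail) case the calls to
    // convertImageAddrToPacked below merge the operands into one vector.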
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions use DMask to specify how many bits of
  // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
  // DMaskLanes for image atomics defaults to '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // changeElementCount will be needed for image loads where Ty is always scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
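  //
  // A rough sketch for an s32 load with TFE (register names illustrative):
  //   %both:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD ...
  //   %data:_(s32), %tfe:_(s32) = G_UNMERGE_VALUES %both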

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
    // destination register.
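    // A rough sketch of the result for an s8 load (illustrative, not from a
    // real test):
    //   %wide:_(s32) = G_AMDGPU_S_BUFFER_LOAD_UBYTE ...
    //   %val:_(s8) = G_TRUNC %wide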
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When the intrinsic definition is fixed, this should have an MMO
  // already.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
                                                  MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0); // Remove intrinsic ID
  castBufferRsrcArgToV4I32(MI, B, 0);
  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  return ST.supportsGetDoorbellID() ?
      legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
        .addImm(0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
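  //
  // After the split, the layout is roughly (block names illustrative):
  //   bb.cur:              ; original block up to the trap
  //     S_CBRANCH_EXECNZ %bb.trap
  //   bb.split:            ; fallthrough; remainder of the original block
  //     ...
  //   bb.trap:
  //     S_ENDPGM 0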
  BB.splitAt(MI, false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      .addImm(0);
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(TrapBB);

  BB.addSuccessor(TrapBB);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
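  // A rough sketch of the sequence emitted below (register names illustrative):
  //   %kernarg:_(p4) = <KERNARG_SEGMENT_PTR input>
  //   %addr:_(p4)    = G_PTR_ADD %kernarg, <queue_ptr offset>
  //   %qptr:_(s64)   = G_LOAD %addr
  //   $sgpr0_sgpr1   = COPY %qptr
  //   S_TRAP 2 (LLVMAMDHSATrap), implicit $sgpr0_sgpr1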
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass the queue pointer to the trap handler as input, and insert the trap
  // instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           MI.getDebugLoc());
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
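    // Illustrative sketch for 11 address dwords (register names made up):
    //   %vaddr:_(<11 x s32>) = G_BUILD_VECTOR %op0, %op1, ..., %op10
    // so the non-NSA encoding sees a single contiguous VGPR tuple.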
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
                                         MachineIRBuilder &B) const {
  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
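  // A minimal sketch of the expansion below (register names illustrative):
  //   %ttmp8:_(s32)  = COPY $ttmp8
  //   %waveid:_(s32) = G_UBFX %ttmp8, 25, 5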
  if (!ST.hasArchitectedSGPRs())
    return false;
  LLT S32 = LLT::scalar(32);
  Register DstReg = MI.getOperand(0).getReg();
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  return true;
}

static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
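
// These encode the s_getreg/s_setreg fields used below: bits [22:0] of the
// MODE register and bits [4:0] of TRAPSTS. legalizeGetFPEnv reads both and
// merges them into the s64 FP environment value, roughly (pseudocode, names
// illustrative):
//   %mode:_(s32) = s.getreg(FPEnvModeBitField)
//   %trap:_(s32) = s.getreg(FPEnvTrapBitField)
//   %env:_(s64)  = G_MERGE_VALUES %mode, %trap
// legalizeSetFPEnv performs the inverse with G_UNMERGE_VALUES and s.setreg.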

bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
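  //
  // For amdgcn.if, a rough illustrative sketch of the rewrite (not taken from
  // a real test):
  //   %cc:_(s1), %mask = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %cond
  //   G_BRCOND %cc(s1), %bb.then
  // becomes
  //   %mask = SI_IF %cond, %bb.flow   ; updates EXEC, saves the old mask
  //   G_BR %bb.then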
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
          = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
          = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, Helper, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, Helper, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
    return legalizeBufferLoad(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, Helper, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, Helper, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of the tablegen match
    // combiners to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}