The generic legalizer framework is still used to reduce the problem to scalar multiplication with a bit size that is a multiple of 32. Generating optimal code sequences for big integer multiplication is somewhat tricky and has a number of target-specific intricacies: - The target has V_MAD_U64_U32 instructions that multiply two 32-bit factors and add a 64-bit accumulator. Most partial products should use this instruction. - The accumulator is mapped to consecutive 32-bit GPRs, and partial-product multiply-adds can feed the accumulator into each other directly. (The register allocator's support for that is somewhat limited, but that only matters for 128-bit integers and larger.) - OTOH, on some hardware, V_MAD_U64_U32 requires the accumulator to be stored in an even-aligned pair of GPRs. To avoid excessive register copies, it makes sense to compute odd partial products separately from even partial products (where a partial product src0[j0] * src1[j1] is "odd" if j0 + j1 is odd) and add both halves together as a final step. - We can combine G_MUL+G_ADD into a single cascade of multiply-adds. - The target can keep many carry-bits in flight simultaneously, so combining carries using G_UADDE is preferable to G_ZEXT + G_ADD. - Not addressed by this patch: When the factors are sign-extended, the V_MAD_I64_I32 instruction (signed version!) can be used. It is difficult to address these points generically: 1) Finding matching pairs of G_MUL and G_UMULH to find a wide multiply is expensive. We could add a G_UMUL_LOHI generic instruction and conditionally use that in the generic legalizer, but by itself this wouldn't allow us to use the accumulation capability of V_MAD_U64_U32. One could attempt to find matching G_ADD + G_UADDE post-legalization, but this is also expensive. 2) Similarly, making sense of the legalization outcome of a wide pre-legalization G_MUL+G_ADD pair is extremely expensive. 
3) How could the generic legalizer possibly deal with the particular idiosyncrasy of "odd" vs. "even" partial products? All this points in the direction of directly emitting an ideal code sequence during legalization, but the generic legalizer should not be burdened with such overly target-specific concerns. Hence, a custom legalization. Note that the implemented approach is different from that used by SelectionDAG because narrowing of scalars works differently in general. SelectionDAG iteratively cuts wide scalars into low and high halves until a legal size is reached. By contrast, GlobalISel does the narrowing in a single shot, which should be better for compile-time and for the quality of the generated code. This patch leaves three gaps open: 1. When the factors are uniform, we should execute the multiplication on the SALU. Register bank mapping already ensures this. However, the resulting code sequence is not optimal because it doesn't fully use the carry-in capabilities of S_ADDC_U32. (V_MAD_U64_U32 doesn't have a carry-in.) It is very difficult to fix this after the fact, so we should really use a different legalization sequence in this case. Unfortunately, we don't have a divergence analysis and so cannot make that choice. (This only matters for 128-bit integers and larger.) 2. Avoid unnecessary multiplies when sources are known to be zero- or sign-extended. The challenge is that the legalizer does not currently have access to GISelKnownBits. 3. When the G_MUL is followed by a G_ADD, we should consider combining the two instructions into a single multiply-add sequence, to utilize the accumulator of V_MAD_U64_U32 fully (unless the multiply has multiple uses and the implied duplication of the multiply is an overall negative). However, this is also not true when the factors are uniform: in that case, it is generally better to *not* combine the two operations, so that the multiply can be done on the SALU. 
Again, we don't have a divergence analysis available and so cannot make an informed choice. Differential Revision: https://reviews.llvm.org/D124844
215 lines
10 KiB
C++
215 lines
10 KiB
C++
//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
/// \file
|
|
/// This file declares AMDGPULegalizerInfo, the AMDGPU implementation of the
/// GlobalISel LegalizerInfo class.
|
|
/// \todo This should be generated by TableGen.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
|
|
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
|
|
|
|
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
|
|
#include "AMDGPUArgumentUsageInfo.h"
|
|
#include "SIInstrInfo.h"
|
|
|
|
namespace llvm {
|
|
|
|
// Forward declarations. These types are only named through references or
// pointers in the declarations below, so their full definitions are not
// required in this header.
class GCNTargetMachine;
class GCNSubtarget;
class MachineIRBuilder;

namespace AMDGPU {
struct ImageDimIntrinsicInfo;
} // namespace AMDGPU
|
|
/// This class provides the information for the target register banks.
|
|
class AMDGPULegalizerInfo final : public LegalizerInfo {
|
|
const GCNSubtarget &ST;
|
|
|
|
public:
|
|
AMDGPULegalizerInfo(const GCNSubtarget &ST,
|
|
const GCNTargetMachine &TM);
|
|
|
|
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
|
|
|
|
Register getSegmentAperture(unsigned AddrSpace,
|
|
MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool Signed) const;
|
|
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool Signed) const;
|
|
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
|
|
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
|
|
const GlobalValue *GV, int64_t Offset,
|
|
unsigned GAFlags = SIInstrInfo::MO_NONE) const;
|
|
|
|
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
|
|
|
|
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
|
|
double Log2BaseInverted) const;
|
|
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
|
|
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
|
|
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
|
|
ArrayRef<Register> Src0, ArrayRef<Register> Src1,
|
|
bool UsePartialMad64_32,
|
|
bool SeparateOddAlignedProducts) const;
|
|
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
|
|
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
|
|
const ArgDescriptor *Arg,
|
|
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
|
|
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
|
|
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
|
|
|
|
bool legalizePreloadedArgIntrin(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
|
|
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
|
|
bool legalizeWorkitemIDIntrinsic(
|
|
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
|
|
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
|
|
|
|
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
|
|
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
|
|
uint64_t Offset,
|
|
Align Alignment = Align(4)) const;
|
|
|
|
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg,
|
|
Register DstRemReg, Register Num,
|
|
Register Den) const;
|
|
|
|
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg,
|
|
Register DstRemReg, Register Num,
|
|
Register Den) const;
|
|
|
|
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
|
|
MachineInstr &MI, Intrinsic::ID IID) const;
|
|
|
|
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, unsigned AddrSpace) const;
|
|
|
|
std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B,
|
|
Register OrigOffset) const;
|
|
void updateBufferMMO(MachineMemOperand *MMO, Register VOffset,
|
|
Register SOffset, unsigned ImmOffset, Register VIndex,
|
|
MachineRegisterInfo &MRI) const;
|
|
|
|
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
|
|
Register Reg, bool ImageStore = false) const;
|
|
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool IsFormat) const;
|
|
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool IsFormat) const;
|
|
Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
|
|
bool IsFormat) const;
|
|
|
|
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool IsTyped,
|
|
bool IsFormat) const;
|
|
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B, bool IsFormat,
|
|
bool IsTyped) const;
|
|
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
|
|
Intrinsic::ID IID) const;
|
|
|
|
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
|
|
|
|
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
|
|
|
|
bool legalizeImageIntrinsic(
|
|
MachineInstr &MI, MachineIRBuilder &B,
|
|
GISelChangeObserver &Observer,
|
|
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
|
|
|
|
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
|
|
|
|
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
|
|
bool IsInc) const;
|
|
|
|
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
|
|
MachineIRBuilder &B) const;
|
|
|
|
bool legalizeIntrinsic(LegalizerHelper &Helper,
|
|
MachineInstr &MI) const override;
|
|
};
|
|
} // End llvm namespace.
|
|
#endif
|