When replacing certain AMDGPU library calls with constant data vectors, the existing implementation only handled single- and double-precision floats. This change extends the functionality to also support half-precision floats. Additionally, it refactors the function responsible for generating constant float data vectors to improve readability and reduce code duplication. In tandem with this refactoring, the patch relaxes the check for constant data vectors to include any constant of vector type. This allows other constant vectors to be processed, such as those created from constant aggregate zeros (e.g. `<2 x float> zeroinitializer`). --------- Signed-off-by: Steffen Holst Larsen <sholstla@amd.com>
2063 lines
68 KiB
C++
2063 lines
68 KiB
C++
//===- AMDGPULibCalls.cpp -------------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// This file does AMD library function optimizations.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPULibFunc.h"
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/IR/AttributeMask.h"
|
|
#include "llvm/IR/Dominators.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/MDBuilder.h"
|
|
#include "llvm/IR/PatternMatch.h"
|
|
#include <cmath>
|
|
|
|
#define DEBUG_TYPE "amdgpu-simplifylib"
|
|
|
|
using namespace llvm;
|
|
using namespace llvm::PatternMatch;
|
|
|
|
// Hidden switch selecting pre-link mode; when set, getFunction() is allowed
// to insert declarations for library functions instead of only looking them
// up.
static cl::opt<bool>
    EnablePreLink("amdgpu-prelink",
                  cl::desc("Enable pre-link mode optimizations"),
                  cl::init(false), cl::Hidden);

// Hidden, comma-separated list of function names whose calls should be
// replaced with their native counterparts.
static cl::list<std::string> UseNative(
    "amdgpu-use-native",
    cl::desc("Comma separated list of functions to replace with native, or all"),
    cl::CommaSeparated, cl::ValueOptional, cl::Hidden);

// Shorthands for the math constants used by the fold tables.
#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

// Which member of the pow family an expansion is being performed for.
enum class PowKind { Pow, PowR, PowN, RootN };
|
|
|
|
namespace llvm {
|
|
|
|
class AMDGPULibCalls {
|
|
private:
|
|
SimplifyQuery SQ;
|
|
|
|
using FuncInfo = llvm::AMDGPULibFunc;
|
|
|
|
// -fuse-native.
|
|
bool AllNative = false;
|
|
|
|
bool useNativeFunc(const StringRef F) const;
|
|
|
|
// Return a pointer (pointer expr) to the function if function definition with
|
|
// "FuncName" exists. It may create a new function prototype in pre-link mode.
|
|
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
|
|
|
|
/// Wrapper around getFunction which tries to use a faster variant if
|
|
/// available, and falls back to a less fast option.
|
|
///
|
|
/// Return a replacement function for \p fInfo that has float-typed fast
|
|
/// variants. \p NewFunc is a base replacement function to use. \p
|
|
/// NewFuncFastVariant is a faster version to use if the calling context knows
|
|
/// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
|
|
/// should be EI_NONE.
|
|
FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
|
|
FuncInfo &newInfo,
|
|
AMDGPULibFunc::EFuncId NewFunc,
|
|
AMDGPULibFunc::EFuncId NewFuncFastVariant);
|
|
|
|
bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
|
|
|
|
bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
|
|
|
|
/* Specialized optimizations */
|
|
|
|
// pow/powr/pown
|
|
bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
|
|
|
|
/// Peform a fast math expansion of pow, powr, pown or rootn.
|
|
bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);
|
|
|
|
bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
|
|
const FuncInfo &FInfo);
|
|
|
|
// rootn
|
|
bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
|
|
|
|
// -fuse-native for sincos
|
|
bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
|
|
|
|
// evaluate calls if calls' arguments are constants.
|
|
bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
|
|
APFloat &Res1, Constant *copr0, Constant *copr1);
|
|
bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
|
|
|
|
/// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
|
|
/// of cos, sincos call).
|
|
std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
|
|
FastMathFlags FMF,
|
|
IRBuilder<> &B,
|
|
FunctionCallee Fsincos);
|
|
|
|
// sin/cos
|
|
bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
|
|
|
|
// __read_pipe/__write_pipe
|
|
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
|
|
const FuncInfo &FInfo);
|
|
|
|
// Get a scalar native builtin single argument FP function
|
|
FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
|
|
|
|
/// Substitute a call to a known libcall with an intrinsic call. If \p
|
|
/// AllowMinSize is true, allow the replacement in a minsize function.
|
|
bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
|
|
bool AllowMinSizeF32 = false,
|
|
bool AllowF64 = false,
|
|
bool AllowStrictFP = false);
|
|
void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
|
|
Intrinsic::ID IntrID);
|
|
|
|
bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
|
|
Intrinsic::ID IntrID,
|
|
bool AllowMinSizeF32 = false,
|
|
bool AllowF64 = false,
|
|
bool AllowStrictFP = false);
|
|
|
|
protected:
|
|
bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
|
|
|
|
bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
|
|
|
|
static void replaceCall(Instruction *I, Value *With) {
|
|
I->replaceAllUsesWith(With);
|
|
I->eraseFromParent();
|
|
}
|
|
|
|
static void replaceCall(FPMathOperator *I, Value *With) {
|
|
replaceCall(cast<Instruction>(I), With);
|
|
}
|
|
|
|
public:
|
|
AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);
|
|
|
|
bool fold(CallInst *CI);
|
|
|
|
void initNativeFuncs();
|
|
|
|
// Replace a normal math function call with that native version
|
|
bool useNative(CallInst *CI);
|
|
};
|
|
|
|
} // end namespace llvm
|
|
|
|
template <typename IRB>
|
|
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
|
|
const Twine &Name = "") {
|
|
CallInst *R = B.CreateCall(Callee, Arg, Name);
|
|
if (Function *F = dyn_cast<Function>(Callee.getCallee()))
|
|
R->setCallingConv(F->getCallingConv());
|
|
return R;
|
|
}
|
|
|
|
template <typename IRB>
|
|
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
|
|
Value *Arg2, const Twine &Name = "") {
|
|
CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
|
|
if (Function *F = dyn_cast<Function>(Callee.getCallee()))
|
|
R->setCallingConv(F->getCallingConv());
|
|
return R;
|
|
}
|
|
|
|
/// Build the function type (RetTy, {Param0Ty, ExpTy}) from \p FT, where ExpTy
/// is i32, or a vector of i32 with the return type's element count when the
/// return type is a vector.
static FunctionType *getPownType(FunctionType *FT) {
  Type *ExpTy = Type::getInt32Ty(FT->getContext());
  Type *RetTy = FT->getReturnType();
  if (auto *RetVecTy = dyn_cast<VectorType>(RetTy))
    ExpTy = VectorType::get(ExpTy, RetVecTy->getElementCount());

  return FunctionType::get(RetTy, {FT->getParamType(0), ExpTy},
                           /*isVarArg=*/false);
}
|
|
|
|
// Data structures for table-driven optimizations.
|
|
// FuncTbl works for both f32 and f64 functions with 1 input argument
|
|
|
|
struct TableEntry {
|
|
double result;
|
|
double input;
|
|
};
|
|
|
|
/* a list of {result, input} */
|
|
static const TableEntry tbl_acos[] = {
|
|
{MATH_PI / 2.0, 0.0},
|
|
{MATH_PI / 2.0, -0.0},
|
|
{0.0, 1.0},
|
|
{MATH_PI, -1.0}
|
|
};
|
|
static const TableEntry tbl_acosh[] = {
|
|
{0.0, 1.0}
|
|
};
|
|
static const TableEntry tbl_acospi[] = {
|
|
{0.5, 0.0},
|
|
{0.5, -0.0},
|
|
{0.0, 1.0},
|
|
{1.0, -1.0}
|
|
};
|
|
static const TableEntry tbl_asin[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0},
|
|
{MATH_PI / 2.0, 1.0},
|
|
{-MATH_PI / 2.0, -1.0}
|
|
};
|
|
static const TableEntry tbl_asinh[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_asinpi[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0},
|
|
{0.5, 1.0},
|
|
{-0.5, -1.0}
|
|
};
|
|
static const TableEntry tbl_atan[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0},
|
|
{MATH_PI / 4.0, 1.0},
|
|
{-MATH_PI / 4.0, -1.0}
|
|
};
|
|
static const TableEntry tbl_atanh[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_atanpi[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0},
|
|
{0.25, 1.0},
|
|
{-0.25, -1.0}
|
|
};
|
|
static const TableEntry tbl_cbrt[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0},
|
|
{1.0, 1.0},
|
|
{-1.0, -1.0},
|
|
};
|
|
static const TableEntry tbl_cos[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_cosh[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_cospi[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_erfc[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_erf[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_exp[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0},
|
|
{MATH_E, 1.0}
|
|
};
|
|
static const TableEntry tbl_exp2[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0},
|
|
{2.0, 1.0}
|
|
};
|
|
static const TableEntry tbl_exp10[] = {
|
|
{1.0, 0.0},
|
|
{1.0, -0.0},
|
|
{10.0, 1.0}
|
|
};
|
|
static const TableEntry tbl_expm1[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_log[] = {
|
|
{0.0, 1.0},
|
|
{1.0, MATH_E}
|
|
};
|
|
static const TableEntry tbl_log2[] = {
|
|
{0.0, 1.0},
|
|
{1.0, 2.0}
|
|
};
|
|
static const TableEntry tbl_log10[] = {
|
|
{0.0, 1.0},
|
|
{1.0, 10.0}
|
|
};
|
|
static const TableEntry tbl_rsqrt[] = {
|
|
{1.0, 1.0},
|
|
{MATH_SQRT1_2, 2.0}
|
|
};
|
|
static const TableEntry tbl_sin[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_sinh[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_sinpi[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_sqrt[] = {
|
|
{0.0, 0.0},
|
|
{1.0, 1.0},
|
|
{MATH_SQRT2, 2.0}
|
|
};
|
|
static const TableEntry tbl_tan[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_tanh[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_tanpi[] = {
|
|
{0.0, 0.0},
|
|
{-0.0, -0.0}
|
|
};
|
|
static const TableEntry tbl_tgamma[] = {
|
|
{1.0, 1.0},
|
|
{1.0, 2.0},
|
|
{2.0, 3.0},
|
|
{6.0, 4.0}
|
|
};
|
|
|
|
/// True for the library function ids listed below, i.e. those for which a
/// native variant may be requested.
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:
    return false;
  }
}
|
|
|
|
using TableRef = ArrayRef<TableEntry>;
|
|
|
|
static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
|
|
switch(id) {
|
|
case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
|
|
case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
|
|
case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
|
|
case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
|
|
case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
|
|
case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
|
|
case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
|
|
case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
|
|
case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
|
|
case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
|
|
case AMDGPULibFunc::EI_NCOS:
|
|
case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
|
|
case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
|
|
case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
|
|
case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
|
|
case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
|
|
case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
|
|
case AMDGPULibFunc::EI_NEXP2:
|
|
case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
|
|
case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
|
|
case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
|
|
case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
|
|
case AMDGPULibFunc::EI_NLOG2:
|
|
case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
|
|
case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
|
|
case AMDGPULibFunc::EI_NRSQRT:
|
|
case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
|
|
case AMDGPULibFunc::EI_NSIN:
|
|
case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
|
|
case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
|
|
case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
|
|
case AMDGPULibFunc::EI_NSQRT:
|
|
case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
|
|
case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
|
|
case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
|
|
case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
|
|
case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
|
|
default:;
|
|
}
|
|
return TableRef();
|
|
}
|
|
|
|
static inline int getVecSize(const AMDGPULibFunc& FInfo) {
|
|
return FInfo.getLeads()[0].VectorSize;
|
|
}
|
|
|
|
/// Argument type recorded for the first leading parameter of \p FInfo.
static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
  const auto &FirstLead = FInfo.getLeads()[0];
  return static_cast<AMDGPULibFunc::EType>(FirstLead.ArgType);
}
|
|
|
|
/// Resolve \p fInfo to a callee in module \p M.
FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // In pre-link mode the function is external, so it is safe to use
  // getOrInsertFunction() at this stage.
  if (EnablePreLink)
    return AMDGPULibFunc::getOrInsertFunction(M, fInfo);

  // Otherwise only look the function up.
  return AMDGPULibFunc::getFunction(M, fInfo);
}
|
|
|
|
/// Pick a replacement callee for \p fInfo. For F32 arguments with a usable
/// \p FastVariant, that variant is tried first; otherwise (or if it cannot
/// be resolved) \p NewFunc is used. \p newInfo is set to the descriptor of
/// whichever function was looked up last.
FunctionCallee AMDGPULibCalls::getFloatFastVariant(
    Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
    AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
  assert(NewFunc != FastVariant);

  const bool TryFast = FastVariant != AMDGPULibFunc::EI_NONE &&
                       getArgType(fInfo) == AMDGPULibFunc::F32;
  if (TryFast) {
    newInfo = AMDGPULibFunc(FastVariant, fInfo);
    FunctionCallee FastCallee = getFunction(M, newInfo);
    if (FastCallee)
      return FastCallee;
  }

  // No fast variant applies; fall back to the base replacement.
  newInfo = AMDGPULibFunc(NewFunc, fInfo);
  return getFunction(M, newInfo);
}
|
|
|
|
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
|
|
FuncInfo &FInfo) {
|
|
return AMDGPULibFunc::parse(FMangledName, FInfo);
|
|
}
|
|
|
|
bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
|
|
return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
|
|
}
|
|
|
|
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
|
|
const FPMathOperator *FPOp) const {
|
|
// TODO: Refine to approxFunc or contract
|
|
return FPOp->isFast();
|
|
}
|
|
|
|
/// Set up the simplify query from the analyses of \p F: the module's data
/// layout, target library info, the dominator tree if one is already cached,
/// and the assumption cache.
AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
    : SQ(/*DL=*/F.getParent()->getDataLayout(),
         /*TLI=*/&FAM.getResult<TargetLibraryAnalysis>(F),
         /*DT=*/FAM.getCachedResult<DominatorTreeAnalysis>(F),
         /*AC=*/&FAM.getResult<AssumptionAnalysis>(F)) {}
|
|
|
|
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
|
|
return AllNative || llvm::is_contained(UseNative, F);
|
|
}
|
|
|
|
/// Compute AllNative: set when "all" is requested, or when the option was
/// given and its list consists of a single empty entry.
void AMDGPULibCalls::initNativeFuncs() {
  if (useNativeFunc("all")) {
    AllNative = true;
    return;
  }

  const bool OnlyEmptyEntry = UseNative.getNumOccurrences() &&
                              UseNative.size() == 1 &&
                              UseNative.begin()->empty();
  AllNative = OnlyEmptyEntry;
}
|
|
|
|
/// Split the sincos call \p aCI into separate native sin and native cos
/// calls when both "sin" and "cos" are selected for native replacement and
/// both native functions can be resolved. The cos result is stored through
/// the call's second argument and the sin result replaces the call.
///
/// \returns true if the call was replaced.
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  const bool WantNativeSin = useNativeFunc("sin");
  const bool WantNativeCos = useNativeFunc("cos");
  if (!WantNativeSin || !WantNativeCos)
    return false;

  Module *M = aCI->getModule();
  Value *Arg = aCI->getArgOperand(0);

  // Describe the native functions with the same argument type and vector
  // size as the original call.
  AMDGPULibFunc NativeInfo;
  NativeInfo.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
  NativeInfo.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
  NativeInfo.setId(AMDGPULibFunc::EI_SIN);
  FunctionCallee SinFn = getFunction(M, NativeInfo);

  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
  NativeInfo.setId(AMDGPULibFunc::EI_COS);
  FunctionCallee CosFn = getFunction(M, NativeInfo);

  if (!SinFn || !CosFn)
    return false;

  // Everything is inserted right before the original call.
  Value *SinVal = CallInst::Create(SinFn, Arg, "splitsin", aCI->getIterator());
  Value *CosVal = CallInst::Create(CosFn, Arg, "splitcos", aCI->getIterator());
  new StoreInst(CosVal, aCI->getArgOperand(1), aCI->getIterator());

  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version of sin/cos");

  replaceCall(aCI, SinVal);
  return true;
}
|
|
|
|
/// Replace the library call \p aCI with its native variant when requested.
///
/// The call is left alone unless it is a direct, builtin-eligible call to a
/// mangled, prefix-less, non-F64 library function that has a native variant
/// and was selected through the use-native option. sincos is handled by
/// sincosUseNative().
///
/// \returns true if the call was changed.
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  // useNativeFunc() already accounts for AllNative, so no separate check is
  // needed here.
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !useNativeFunc(FInfo.getName())) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  // Log before retargeting so the message shows the call being replaced
  // rather than the already-rewritten one.
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  aCI->setCalledFunction(F);
  return true;
}
|
|
|
|
// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
|
|
// builtin, with appended type size and alignment arguments, where 2 or 4
|
|
// indicates the original number of arguments. The library has optimized version
|
|
// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
|
|
// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
|
|
// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
|
|
// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
|
|
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
|
|
const FuncInfo &FInfo) {
|
|
auto *Callee = CI->getCalledFunction();
|
|
if (!Callee->isDeclaration())
|
|
return false;
|
|
|
|
assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
|
|
auto *M = Callee->getParent();
|
|
std::string Name = std::string(Callee->getName());
|
|
auto NumArg = CI->arg_size();
|
|
if (NumArg != 4 && NumArg != 6)
|
|
return false;
|
|
ConstantInt *PacketSize =
|
|
dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
|
|
ConstantInt *PacketAlign =
|
|
dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
|
|
if (!PacketSize || !PacketAlign)
|
|
return false;
|
|
|
|
unsigned Size = PacketSize->getZExtValue();
|
|
Align Alignment = PacketAlign->getAlignValue();
|
|
if (Alignment != Size)
|
|
return false;
|
|
|
|
unsigned PtrArgLoc = CI->arg_size() - 3;
|
|
Value *PtrArg = CI->getArgOperand(PtrArgLoc);
|
|
Type *PtrTy = PtrArg->getType();
|
|
|
|
SmallVector<llvm::Type *, 6> ArgTys;
|
|
for (unsigned I = 0; I != PtrArgLoc; ++I)
|
|
ArgTys.push_back(CI->getArgOperand(I)->getType());
|
|
ArgTys.push_back(PtrTy);
|
|
|
|
Name = Name + "_" + std::to_string(Size);
|
|
auto *FTy = FunctionType::get(Callee->getReturnType(),
|
|
ArrayRef<Type *>(ArgTys), false);
|
|
AMDGPULibFunc NewLibFunc(Name, FTy);
|
|
FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
|
|
if (!F)
|
|
return false;
|
|
|
|
SmallVector<Value *, 6> Args;
|
|
for (unsigned I = 0; I != PtrArgLoc; ++I)
|
|
Args.push_back(CI->getArgOperand(I));
|
|
Args.push_back(PtrArg);
|
|
|
|
auto *NCI = B.CreateCall(F, Args);
|
|
NCI->setAttributes(CI->getAttributes());
|
|
CI->replaceAllUsesWith(NCI);
|
|
CI->dropAllReferences();
|
|
CI->eraseFromParent();
|
|
|
|
return true;
|
|
}
|
|
|
|
// This function returns false if no change; return true otherwise.
|
|
bool AMDGPULibCalls::fold(CallInst *CI) {
|
|
Function *Callee = CI->getCalledFunction();
|
|
// Ignore indirect calls.
|
|
if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
|
|
return false;
|
|
|
|
FuncInfo FInfo;
|
|
if (!parseFunctionName(Callee->getName(), FInfo))
|
|
return false;
|
|
|
|
// Further check the number of arguments to see if they match.
|
|
// TODO: Check calling convention matches too
|
|
if (!FInfo.isCompatibleSignature(*Callee->getParent(), CI->getFunctionType()))
|
|
return false;
|
|
|
|
LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');
|
|
|
|
if (TDOFold(CI, FInfo))
|
|
return true;
|
|
|
|
IRBuilder<> B(CI);
|
|
if (CI->isStrictFP())
|
|
B.setIsFPConstrained(true);
|
|
|
|
if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
|
|
// Under unsafe-math, evaluate calls if possible.
|
|
// According to Brian Sumner, we can do this for all f32 function calls
|
|
// using host's double function calls.
|
|
if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
|
|
return true;
|
|
|
|
// Copy fast flags from the original call.
|
|
FastMathFlags FMF = FPOp->getFastMathFlags();
|
|
B.setFastMathFlags(FMF);
|
|
|
|
// Specialized optimizations for each function call.
|
|
//
|
|
// TODO: Handle native functions
|
|
switch (FInfo.getId()) {
|
|
case AMDGPULibFunc::EI_EXP:
|
|
if (FMF.none())
|
|
return false;
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
|
|
FMF.approxFunc());
|
|
case AMDGPULibFunc::EI_EXP2:
|
|
if (FMF.none())
|
|
return false;
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
|
|
FMF.approxFunc());
|
|
case AMDGPULibFunc::EI_LOG:
|
|
if (FMF.none())
|
|
return false;
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
|
|
FMF.approxFunc());
|
|
case AMDGPULibFunc::EI_LOG2:
|
|
if (FMF.none())
|
|
return false;
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
|
|
FMF.approxFunc());
|
|
case AMDGPULibFunc::EI_LOG10:
|
|
if (FMF.none())
|
|
return false;
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
|
|
FMF.approxFunc());
|
|
case AMDGPULibFunc::EI_FMIN:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
|
|
true, true);
|
|
case AMDGPULibFunc::EI_FMAX:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
|
|
true, true);
|
|
case AMDGPULibFunc::EI_FMA:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_MAD:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
|
|
true, true);
|
|
case AMDGPULibFunc::EI_FABS:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
|
|
true, true);
|
|
case AMDGPULibFunc::EI_COPYSIGN:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
|
|
true, true, true);
|
|
case AMDGPULibFunc::EI_FLOOR:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_CEIL:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_TRUNC:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_RINT:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_ROUND:
|
|
return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
|
|
true);
|
|
case AMDGPULibFunc::EI_LDEXP: {
|
|
if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
|
|
return false;
|
|
|
|
Value *Arg1 = CI->getArgOperand(1);
|
|
if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
|
|
VecTy && !isa<VectorType>(Arg1->getType())) {
|
|
Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
|
|
CI->setArgOperand(1, SplatArg1);
|
|
}
|
|
|
|
CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
|
|
CI->getModule(), Intrinsic::ldexp,
|
|
{CI->getType(), CI->getArgOperand(1)->getType()}));
|
|
return true;
|
|
}
|
|
case AMDGPULibFunc::EI_POW:
|
|
case AMDGPULibFunc::EI_POW_FAST:
|
|
return tryOptimizePow(FPOp, B, FInfo);
|
|
case AMDGPULibFunc::EI_POWR:
|
|
case AMDGPULibFunc::EI_POWR_FAST: {
|
|
if (fold_pow(FPOp, B, FInfo))
|
|
return true;
|
|
if (!FMF.approxFunc())
|
|
return false;
|
|
|
|
if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
|
|
getArgType(FInfo) == AMDGPULibFunc::F32) {
|
|
Module *M = Callee->getParent();
|
|
AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
|
|
if (FunctionCallee PowrFastFunc = getFunction(M, PowrFastInfo)) {
|
|
CI->setCalledFunction(PowrFastFunc);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (!shouldReplaceLibcallWithIntrinsic(CI))
|
|
return false;
|
|
return expandFastPow(FPOp, B, PowKind::PowR);
|
|
}
|
|
case AMDGPULibFunc::EI_POWN:
|
|
case AMDGPULibFunc::EI_POWN_FAST: {
|
|
if (fold_pow(FPOp, B, FInfo))
|
|
return true;
|
|
if (!FMF.approxFunc())
|
|
return false;
|
|
|
|
if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
|
|
getArgType(FInfo) == AMDGPULibFunc::F32) {
|
|
Module *M = Callee->getParent();
|
|
AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
|
|
if (FunctionCallee PownFastFunc = getFunction(M, PownFastInfo)) {
|
|
CI->setCalledFunction(PownFastFunc);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (!shouldReplaceLibcallWithIntrinsic(CI))
|
|
return false;
|
|
return expandFastPow(FPOp, B, PowKind::PowN);
|
|
}
|
|
case AMDGPULibFunc::EI_ROOTN:
|
|
case AMDGPULibFunc::EI_ROOTN_FAST: {
|
|
if (fold_rootn(FPOp, B, FInfo))
|
|
return true;
|
|
if (!FMF.approxFunc())
|
|
return false;
|
|
|
|
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
|
|
Module *M = Callee->getParent();
|
|
AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
|
|
if (FunctionCallee RootnFastFunc = getFunction(M, RootnFastInfo)) {
|
|
CI->setCalledFunction(RootnFastFunc);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return expandFastPow(FPOp, B, PowKind::RootN);
|
|
}
|
|
case AMDGPULibFunc::EI_SQRT:
|
|
// TODO: Allow with strictfp + constrained intrinsic
|
|
return tryReplaceLibcallWithSimpleIntrinsic(
|
|
B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
|
|
case AMDGPULibFunc::EI_COS:
|
|
case AMDGPULibFunc::EI_SIN:
|
|
return fold_sincos(FPOp, B, FInfo);
|
|
default:
|
|
break;
|
|
}
|
|
} else {
|
|
// Specialized optimizations for each function call
|
|
switch (FInfo.getId()) {
|
|
case AMDGPULibFunc::EI_READ_PIPE_2:
|
|
case AMDGPULibFunc::EI_READ_PIPE_4:
|
|
case AMDGPULibFunc::EI_WRITE_PIPE_2:
|
|
case AMDGPULibFunc::EI_WRITE_PIPE_4:
|
|
return fold_read_write_pipe(CI, B, FInfo);
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Build a constant vector from \p Values, converting every element
/// (round-to-nearest-even) to the floating-point semantics of the scalar
/// type of \p Ty.
static Constant *getConstantFloatVector(const ArrayRef<APFloat> Values,
                                        const Type *Ty) {
  Type *ScalarTy = Ty->getScalarType();
  const fltSemantics &Semantics = ScalarTy->getFltSemantics();

  SmallVector<Constant *, 4> Elements;
  Elements.reserve(Values.size());
  for (const APFloat &Src : Values) {
    // convert() mutates its object, so work on a copy of the input value.
    APFloat Converted = Src;
    bool LosesInfo;
    Converted.convert(Semantics, APFloat::rmNearestTiesToEven, &LosesInfo);
    Elements.push_back(ConstantFP::get(ScalarTy, Converted));
  }
  return ConstantVector::get(Elements);
}
|
|
|
|
/// Table-driven optimization: replace \p CI with a constant when its first
/// argument is a constant (scalar, or vector with all elements constant)
/// whose value(s) appear as inputs in the fold table of the called function.
///
/// For vectors every element must be a floating-point constant that has a
/// table row; otherwise nothing is folded.
///
/// \returns true if the call was replaced.
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  const TableRef Table = getOptTable(FInfo.getId());
  if (Table.empty())
    return false;

  // Find the table row whose input is exactly the value of \p Elt, or
  // nullptr if there is none.
  auto FindRow = [Table](const ConstantFP *Elt) -> const TableEntry * {
    const TableEntry *Row =
        llvm::find_if(Table, [Elt](const TableEntry &Entry) {
          return Elt->isExactlyValue(Entry.input);
        });
    return Row == Table.end() ? nullptr : Row;
  };

  Value *Opr0 = CI->getArgOperand(0);
  const int VecSize = getVecSize(FInfo);

  if (VecSize > 1) {
    // Vector version
    Constant *CV = dyn_cast<Constant>(Opr0);
    if (!CV || !CV->getType()->isVectorTy())
      return false;

    SmallVector<APFloat, 4> Values;
    Values.reserve(VecSize);
    for (int EltNo = 0; EltNo < VecSize; ++EltNo) {
      // Any vector constant is accepted above, so an element may be missing
      // (e.g. a constant expression) or not a ConstantFP (e.g. undef or
      // poison). Give up on those instead of asserting in cast<>.
      auto *EltVal = dyn_cast_if_present<ConstantFP>(
          CV->getAggregateElement((unsigned)EltNo));
      if (!EltVal)
        return false;
      const TableEntry *Row = FindRow(EltVal);
      if (!Row)
        return false;
      Values.push_back(APFloat(Row->result));
    }
    Constant *NewValues = getConstantFloatVector(Values, CI->getType());
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
    replaceCall(CI, NewValues);
    return true;
  }

  // Scalar version
  if (ConstantFP *CF = dyn_cast<ConstantFP>(Opr0)) {
    if (const TableEntry *Row = FindRow(CF)) {
      Value *NVal = ConstantFP::get(CF->getType(), Row->result);
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NVal << "\n");
      replaceCall(CI, NVal);
      return true;
    }
  }

  return false;
}
|
|
|
|
namespace llvm {
/// Base-2 logarithm of \p V. Uses the C library's log2 when the feature-test
/// macros below indicate it is available, otherwise log(V) / ln(2).
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  const double Result = ::log2(V);
#else
  const double Result = log(V) / numbers::ln2;
#endif
  return Result;
}
} // namespace llvm
|
|
|
|
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
|
|
const FuncInfo &FInfo) {
|
|
assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POWR ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POWN ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
|
|
"fold_pow: encounter a wrong function call");
|
|
|
|
Module *M = B.GetInsertBlock()->getModule();
|
|
Type *eltType = FPOp->getType()->getScalarType();
|
|
Value *opr0 = FPOp->getOperand(0);
|
|
Value *opr1 = FPOp->getOperand(1);
|
|
|
|
const APFloat *CF = nullptr;
|
|
const APInt *CINT = nullptr;
|
|
if (!match(opr1, m_APFloatAllowPoison(CF)))
|
|
match(opr1, m_APIntAllowPoison(CINT));
|
|
|
|
// 0x1111111 means that we don't do anything for this call.
|
|
int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
|
|
|
|
if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
|
|
// pow/powr/pown(x, 0) == 1
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
|
|
Constant *cnval = ConstantFP::get(eltType, 1.0);
|
|
if (getVecSize(FInfo) > 1) {
|
|
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
|
|
}
|
|
replaceCall(FPOp, cnval);
|
|
return true;
|
|
}
|
|
if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
|
|
// pow/powr/pown(x, 1.0) = x
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
|
|
replaceCall(FPOp, opr0);
|
|
return true;
|
|
}
|
|
if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
|
|
// pow/powr/pown(x, 2.0) = x*x
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
|
|
<< *opr0 << "\n");
|
|
Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
|
|
replaceCall(FPOp, nval);
|
|
return true;
|
|
}
|
|
if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
|
|
// pow/powr/pown(x, -1.0) = 1.0/x
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
|
|
Constant *cnval = ConstantFP::get(eltType, 1.0);
|
|
if (getVecSize(FInfo) > 1) {
|
|
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
|
|
}
|
|
Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
|
|
replaceCall(FPOp, nval);
|
|
return true;
|
|
}
|
|
|
|
if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
|
|
// pow[r](x, [-]0.5) = sqrt(x)
|
|
bool issqrt = CF->isExactlyValue(0.5);
|
|
if (FunctionCallee FPExpr =
|
|
getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
|
|
: AMDGPULibFunc::EI_RSQRT,
|
|
FInfo))) {
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
|
|
<< '(' << *opr0 << ")\n");
|
|
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
|
|
: "__pow2rsqrt");
|
|
replaceCall(FPOp, nval);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (!isUnsafeFiniteOnlyMath(FPOp))
|
|
return false;
|
|
|
|
// Unsafe Math optimization
|
|
|
|
// Remember that ci_opr1 is set if opr1 is integral
|
|
if (CF) {
|
|
double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
|
|
? (double)CF->convertToFloat()
|
|
: CF->convertToDouble();
|
|
int ival = (int)dval;
|
|
if ((double)ival == dval) {
|
|
ci_opr1 = ival;
|
|
} else
|
|
ci_opr1 = 0x11111111;
|
|
}
|
|
|
|
// pow/powr/pown(x, c) = [1/](x*x*..x); where
|
|
// trunc(c) == c && the number of x == c && |c| <= 12
|
|
unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
|
|
if (abs_opr1 <= 12) {
|
|
Constant *cnval;
|
|
Value *nval;
|
|
if (abs_opr1 == 0) {
|
|
cnval = ConstantFP::get(eltType, 1.0);
|
|
if (getVecSize(FInfo) > 1) {
|
|
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
|
|
}
|
|
nval = cnval;
|
|
} else {
|
|
Value *valx2 = nullptr;
|
|
nval = nullptr;
|
|
while (abs_opr1 > 0) {
|
|
valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
|
|
if (abs_opr1 & 1) {
|
|
nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
|
|
}
|
|
abs_opr1 >>= 1;
|
|
}
|
|
}
|
|
|
|
if (ci_opr1 < 0) {
|
|
cnval = ConstantFP::get(eltType, 1.0);
|
|
if (getVecSize(FInfo) > 1) {
|
|
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
|
|
}
|
|
nval = B.CreateFDiv(cnval, nval, "__1powprod");
|
|
}
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
|
|
<< ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
|
|
<< ")\n");
|
|
replaceCall(FPOp, nval);
|
|
return true;
|
|
}
|
|
|
|
// If we should use the generic intrinsic instead of emitting a libcall
|
|
const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();
|
|
|
|
// powr ---> exp2(y * log2(x))
|
|
// pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
|
|
FunctionCallee ExpExpr;
|
|
if (ShouldUseIntrinsic)
|
|
ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2,
|
|
{FPOp->getType()});
|
|
else {
|
|
ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
|
|
if (!ExpExpr)
|
|
return false;
|
|
}
|
|
|
|
bool needlog = false;
|
|
bool needabs = false;
|
|
bool needcopysign = false;
|
|
Constant *cnval = nullptr;
|
|
if (getVecSize(FInfo) == 1) {
|
|
CF = nullptr;
|
|
match(opr0, m_APFloatAllowPoison(CF));
|
|
|
|
if (CF) {
|
|
double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
|
|
? (double)CF->convertToFloat()
|
|
: CF->convertToDouble();
|
|
|
|
V = log2(std::abs(V));
|
|
cnval = ConstantFP::get(eltType, V);
|
|
needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
|
|
FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
|
|
CF->isNegative();
|
|
} else {
|
|
needlog = true;
|
|
needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
|
|
FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
|
|
}
|
|
} else {
|
|
ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
|
|
|
|
if (!CDV) {
|
|
needlog = true;
|
|
needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
|
|
FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
|
|
} else {
|
|
assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
|
|
"Wrong vector size detected");
|
|
|
|
SmallVector<double, 0> DVal;
|
|
for (int i=0; i < getVecSize(FInfo); ++i) {
|
|
double V = CDV->getElementAsAPFloat(i).convertToDouble();
|
|
if (V < 0.0) needcopysign = true;
|
|
V = log2(std::abs(V));
|
|
DVal.push_back(V);
|
|
}
|
|
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
|
|
SmallVector<float, 0> FVal;
|
|
for (double D : DVal)
|
|
FVal.push_back((float)D);
|
|
ArrayRef<float> tmp(FVal);
|
|
cnval = ConstantDataVector::get(M->getContext(), tmp);
|
|
} else {
|
|
ArrayRef<double> tmp(DVal);
|
|
cnval = ConstantDataVector::get(M->getContext(), tmp);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
|
|
// We cannot handle corner cases for a general pow() function, give up
|
|
// unless y is a constant integral value. Then proceed as if it were pown.
|
|
if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)),
|
|
FPOp->getFastMathFlags()))
|
|
return false;
|
|
}
|
|
|
|
Value *nval;
|
|
if (needabs) {
|
|
nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
|
|
} else {
|
|
nval = cnval ? cnval : opr0;
|
|
}
|
|
if (needlog) {
|
|
FunctionCallee LogExpr;
|
|
if (ShouldUseIntrinsic) {
|
|
LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2,
|
|
{FPOp->getType()});
|
|
} else {
|
|
LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
|
|
if (!LogExpr)
|
|
return false;
|
|
}
|
|
|
|
nval = CreateCallEx(B,LogExpr, nval, "__log2");
|
|
}
|
|
|
|
if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
|
|
FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
|
|
// convert int(32) to fp(f32 or f64)
|
|
opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
|
|
}
|
|
nval = B.CreateFMul(opr1, nval, "__ylogx");
|
|
|
|
CallInst *Exp2Call = CreateCallEx(B, ExpExpr, nval, "__exp2");
|
|
|
|
// TODO: Generalized fpclass logic for pow
|
|
FPClassTest KnownNot = FPClassTest::fcNegative;
|
|
if (FPOp->hasNoNaNs())
|
|
KnownNot |= FPClassTest::fcNan;
|
|
|
|
Exp2Call->addRetAttr(
|
|
Attribute::getWithNoFPClass(Exp2Call->getContext(), KnownNot));
|
|
nval = Exp2Call;
|
|
|
|
if (needcopysign) {
|
|
Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
|
|
Type *nTy = FPOp->getType()->getWithNewType(nTyS);
|
|
Value *opr_n = FPOp->getOperand(1);
|
|
if (opr_n->getType()->getScalarType()->isIntegerTy())
|
|
opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
|
|
else
|
|
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
|
|
|
|
unsigned size = nTy->getScalarSizeInBits();
|
|
Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
|
|
sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
|
|
|
|
nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
|
|
nullptr, "__pow_sign");
|
|
}
|
|
|
|
LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
|
|
<< "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
|
|
replaceCall(FPOp, nval);
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Simplify a rootn(x, n) call whose root \p n matches a constant integer.
///
/// Handled roots:
///   n ==  1: replaced by x (not done inside strictfp functions)
///   n ==  2: replaced by llvm.sqrt(x) when the intrinsic replacement is allowed
///   n ==  3: replaced by cbrt(x) when getFunction yields a callee for it
///   n == -1: replaced by 1.0 / x
///   n == -2: replaced by 1.0 / llvm.sqrt(x) when the intrinsic replacement is
///            allowed
///
/// \returns true if the call was replaced, false otherwise.
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *Radicand = FPOp->getOperand(0);

  // Only constant integer roots can be folded here.
  const APInt *RootC = nullptr;
  if (!match(FPOp->getOperand(1), m_APIntAllowPoison(RootC)))
    return false;

  Function *ParentFn = B.GetInsertBlock()->getParent();
  const int Root = static_cast<int>(RootC->getSExtValue());

  // rootn(x, 1) = x
  //
  // TODO: Insert constrained canonicalize for strictfp case.
  if (Root == 1 && !ParentFn->hasFnAttribute(Attribute::StrictFP)) {
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *Radicand << '\n');
    replaceCall(FPOp, Radicand);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();
  CallInst *CI = cast<CallInst>(FPOp);

  switch (Root) {
  case 2: {
    // rootn(x, 2) = sqrt(x)
    if (!shouldReplaceLibcallWithIntrinsic(CI,
                                           /*AllowMinSizeF32=*/true,
                                           /*AllowF64=*/true))
      break;

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *Radicand
                      << ")\n");

    CallInst *SqrtCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, Radicand, CI);
    SqrtCall->takeName(CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so record
    // that on the new call through !fpmath metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *Accuracy =
        MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
    SqrtCall->setMetadata(LLVMContext::MD_fpmath, Accuracy);

    replaceCall(CI, SqrtCall);
    return true;
  }

  case 3: {
    // rootn(x, 3) = cbrt(x)
    FunctionCallee CbrtFn =
        getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo));
    if (!CbrtFn)
      break;

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *Radicand
                      << ")\n");
    Value *Cbrt = CreateCallEx(B, CbrtFn, Radicand, "__rootn2cbrt");
    replaceCall(FPOp, Cbrt);
    return true;
  }

  case -1: {
    // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *Radicand
                      << "\n");
    Constant *One = ConstantFP::get(Radicand->getType(), 1.0);
    Value *Recip = B.CreateFDiv(One, Radicand, "__rootn2div");
    replaceCall(FPOp, Recip);
    return true;
  }

  case -2: {
    // rootn(x, -2) = rsqrt(x)
    if (!shouldReplaceLibcallWithIntrinsic(CI,
                                           /*AllowMinSizeF32=*/true,
                                           /*AllowF64=*/true))
      break;

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *Accuracy =
        MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags Flags = FPOp->getFastMathFlags();
    Flags.setAllowContract(true);

    CallInst *SqrtCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, Radicand, CI);
    Constant *One = ConstantFP::get(Radicand->getType(), 1.0);
    Instruction *RSqrt = cast<Instruction>(B.CreateFDiv(One, SqrtCall));
    SqrtCall->setFastMathFlags(Flags);
    RSqrt->setFastMathFlags(Flags);
    RSqrt->setMetadata(LLVMContext::MD_fpmath, Accuracy);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *Radicand
                      << ")\n");
    replaceCall(CI, RSqrt);
    return true;
  }

  default:
    break;
  }

  return false;
}
|
|
|
|
// is_integer(y) => trunc(y) == y
|
|
static Value *emitIsInteger(IRBuilder<> &B, Value *Y) {
|
|
Value *TruncY = B.CreateUnaryIntrinsic(Intrinsic::trunc, Y);
|
|
return B.CreateFCmpOEQ(TruncY, Y);
|
|
}
|
|
|
|
// Emit the test "y is an even integer": y * 0.5 must itself be an integer,
// since even integers are still integers after division by 2.
static Value *emitIsEvenInteger(IRBuilder<> &B, Value *Y) {
  Constant *Half = ConstantFP::get(Y->getType(), 0.5);
  return emitIsInteger(B, B.CreateFMul(Y, Half));
}
|
|
|
|
// is_odd_integer(y) => is_integer(y) && !is_even_integer(y)
|
|
static Value *emitIsOddInteger(IRBuilder<> &B, Value *Y) {
|
|
Value *IsIntY = emitIsInteger(B, Y);
|
|
Value *IsEvenY = emitIsEvenInteger(B, Y);
|
|
Value *NotEvenY = B.CreateNot(IsEvenY);
|
|
return B.CreateAnd(IsIntY, NotEvenY);
|
|
}
|
|
|
|
// isinf(val) => fabs(val) == +inf
|
|
static Value *emitIsInf(IRBuilder<> &B, Value *val) {
|
|
auto *fabsVal = B.CreateUnaryIntrinsic(Intrinsic::fabs, val);
|
|
return B.CreateFCmpOEQ(fabsVal, ConstantFP::getInfinity(val->getType()));
|
|
}
|
|
|
|
// y * log2(fabs(x))
|
|
static Value *emitFastExpYLnx(IRBuilder<> &B, Value *X, Value *Y) {
|
|
Value *AbsX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
|
|
Value *LogAbsX = B.CreateUnaryIntrinsic(Intrinsic::log2, AbsX);
|
|
Value *YTimesLogX = B.CreateFMul(Y, LogAbsX);
|
|
return B.CreateUnaryIntrinsic(Intrinsic::exp2, YTimesLogX);
|
|
}
|
|
|
|
/// Emit special case management epilog code for fast pow, powr, pown, and rootn
|
|
/// expansions. \p x and \p y should be the arguments to the library call
|
|
/// (possibly with some values clamped). \p expylnx should be the result to use
|
|
/// in normal circumstances.
|
|
static Value *emitPowFixup(IRBuilder<> &B, Value *X, Value *Y, Value *ExpYLnX,
|
|
PowKind Kind) {
|
|
Constant *Zero = ConstantFP::getZero(X->getType());
|
|
Constant *One = ConstantFP::get(X->getType(), 1.0);
|
|
Constant *QNaN = ConstantFP::getQNaN(X->getType());
|
|
Constant *PInf = ConstantFP::getInfinity(X->getType());
|
|
|
|
switch (Kind) {
|
|
case PowKind::Pow: {
|
|
// is_odd_integer(y)
|
|
Value *IsOddY = emitIsOddInteger(B, Y);
|
|
|
|
// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
|
|
Value *SelSign = B.CreateSelect(IsOddY, X, One);
|
|
Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);
|
|
|
|
// if (x < 0 && !is_integer(y)) ret = QNAN
|
|
Value *IsIntY = emitIsInteger(B, Y);
|
|
Value *condNegX = B.CreateFCmpOLT(X, Zero);
|
|
Value *condNotIntY = B.CreateNot(IsIntY);
|
|
Value *condNaN = B.CreateAnd(condNegX, condNotIntY);
|
|
Ret = B.CreateSelect(condNaN, QNaN, Ret);
|
|
|
|
// if (isinf(ay)) { ... }
|
|
|
|
// FIXME: Missing backend optimization to save on materialization cost of
|
|
// mixed sign constant infinities.
|
|
Value *YIsInf = emitIsInf(B, Y);
|
|
|
|
Value *AY = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y);
|
|
Value *YIsNegInf = B.CreateFCmpUNE(Y, AY);
|
|
|
|
Value *AX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
|
|
Value *AxEqOne = B.CreateFCmpOEQ(AX, One);
|
|
Value *AxLtOne = B.CreateFCmpOLT(AX, One);
|
|
Value *XorCond = B.CreateXor(AxLtOne, YIsNegInf);
|
|
Value *SelInf =
|
|
B.CreateSelect(AxEqOne, AX, B.CreateSelect(XorCond, Zero, AY));
|
|
Ret = B.CreateSelect(YIsInf, SelInf, Ret);
|
|
|
|
// if (isinf(ax) || x == 0.0f) { ... }
|
|
Value *XIsInf = emitIsInf(B, X);
|
|
Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
|
|
Value *AxInfOrZero = B.CreateOr(XIsInf, XEqZero);
|
|
Value *YLtZero = B.CreateFCmpOLT(Y, Zero);
|
|
Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
|
|
Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);
|
|
Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
|
|
Value *Copysign = B.CreateCopySign(SelVal, SelSign2);
|
|
Ret = B.CreateSelect(AxInfOrZero, Copysign, Ret);
|
|
|
|
// if (isunordered(x, y)) ret = QNAN
|
|
Value *isUnordered = B.CreateFCmpUNO(X, Y);
|
|
return B.CreateSelect(isUnordered, QNaN, Ret);
|
|
}
|
|
case PowKind::PowR: {
|
|
Value *YIsNeg = B.CreateFCmpOLT(Y, Zero);
|
|
Value *IZ = B.CreateSelect(YIsNeg, PInf, Zero);
|
|
Value *ZI = B.CreateSelect(YIsNeg, Zero, PInf);
|
|
|
|
Value *YEqZero = B.CreateFCmpOEQ(Y, Zero);
|
|
Value *SelZeroCase = B.CreateSelect(YEqZero, QNaN, IZ);
|
|
Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
|
|
Value *Ret = B.CreateSelect(XEqZero, SelZeroCase, ExpYLnX);
|
|
|
|
Value *XEqInf = B.CreateFCmpOEQ(X, PInf);
|
|
Value *YNeZero = B.CreateFCmpUNE(Y, Zero);
|
|
Value *CondInfCase = B.CreateAnd(XEqInf, YNeZero);
|
|
Ret = B.CreateSelect(CondInfCase, ZI, Ret);
|
|
|
|
Value *IsInfY = emitIsInf(B, Y);
|
|
Value *XNeOne = B.CreateFCmpUNE(X, One);
|
|
Value *CondInfY = B.CreateAnd(IsInfY, XNeOne);
|
|
Value *XLtOne = B.CreateFCmpOLT(X, One);
|
|
Value *SelInfYCase = B.CreateSelect(XLtOne, IZ, ZI);
|
|
Ret = B.CreateSelect(CondInfY, SelInfYCase, Ret);
|
|
|
|
Value *IsUnordered = B.CreateFCmpUNO(X, Y);
|
|
return B.CreateSelect(IsUnordered, QNaN, Ret);
|
|
}
|
|
case PowKind::PowN: {
|
|
Constant *ZeroI = ConstantInt::get(Y->getType(), 0);
|
|
|
|
// is_odd_y = (ny & 1) != 0
|
|
Value *OneI = ConstantInt::get(Y->getType(), 1);
|
|
Value *YAnd1 = B.CreateAnd(Y, OneI);
|
|
Value *IsOddY = B.CreateICmpNE(YAnd1, ZeroI);
|
|
|
|
// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
|
|
Value *SelSign = B.CreateSelect(IsOddY, X, One);
|
|
Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);
|
|
|
|
// if (isinf(x) || x == 0.0f)
|
|
Value *FabsX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
|
|
Value *XIsInf = B.CreateFCmpOEQ(FabsX, PInf);
|
|
Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
|
|
Value *InfOrZero = B.CreateOr(XIsInf, XEqZero);
|
|
|
|
// (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
|
|
Value *YLtZero = B.CreateICmpSLT(Y, ZeroI);
|
|
Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
|
|
Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);
|
|
|
|
// copysign(selVal, is_odd_y ? x : 0.0f)
|
|
Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
|
|
Value *Copysign = B.CreateCopySign(SelVal, SelSign2);
|
|
|
|
return B.CreateSelect(InfOrZero, Copysign, Ret);
|
|
}
|
|
case PowKind::RootN: {
|
|
Constant *ZeroI = ConstantInt::get(Y->getType(), 0);
|
|
|
|
// is_odd_y = (ny & 1) != 0
|
|
Value *YAnd1 = B.CreateAnd(Y, ConstantInt::get(Y->getType(), 1));
|
|
Value *IsOddY = B.CreateICmpNE(YAnd1, ZeroI);
|
|
|
|
// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
|
|
Value *SelSign = B.CreateSelect(IsOddY, X, One);
|
|
Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);
|
|
|
|
// if (isinf(x) || x == 0.0f)
|
|
Value *FabsX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
|
|
Value *IsInfX = B.CreateFCmpOEQ(FabsX, PInf);
|
|
Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
|
|
Value *CondInfOrZero = B.CreateOr(IsInfX, XEqZero);
|
|
|
|
// (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
|
|
Value *YLtZero = B.CreateICmpSLT(Y, ZeroI);
|
|
Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
|
|
Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);
|
|
|
|
// copysign(selVal, is_odd_y ? x : 0.0f)
|
|
Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
|
|
Value *Copysign = B.CreateCopySign(SelVal, SelSign2);
|
|
|
|
Ret = B.CreateSelect(CondInfOrZero, Copysign, Ret);
|
|
|
|
// if ((x < 0.0f && !is_odd_y) || ny == 0) ret = QNAN
|
|
Value *XIsNeg = B.CreateFCmpOLT(X, Zero);
|
|
Value *NotOddY = B.CreateNot(IsOddY);
|
|
Value *CondNegAndNotOdd = B.CreateAnd(XIsNeg, NotOddY);
|
|
Value *YEqZero = B.CreateICmpEQ(Y, ZeroI);
|
|
Value *CondBad = B.CreateOr(CondNegAndNotOdd, YEqZero);
|
|
return B.CreateSelect(CondBad, QNaN, Ret);
|
|
}
|
|
}
|
|
|
|
llvm_unreachable("covered switch");
|
|
}
|
|
|
|
// TODO: Move the fold_pow folding to sqrt/fdiv here
|
|
bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
|
|
PowKind Kind) {
|
|
Type *Ty = FPOp->getType();
|
|
|
|
// There's currently no reason to do this for half. The correct path is
|
|
// promote to float and use the fast float expansion.
|
|
//
|
|
// TODO: We could move this expansion to lowering to get half pow to work.
|
|
if (!Ty->getScalarType()->isFloatTy())
|
|
return false;
|
|
|
|
// TODO: Verify optimization for double and bfloat.
|
|
Value *X = FPOp->getOperand(0);
|
|
Value *Y = FPOp->getOperand(1);
|
|
|
|
switch (Kind) {
|
|
case PowKind::Pow: {
|
|
Constant *One = ConstantFP::get(X->getType(), 1.0);
|
|
|
|
// if (x == 1.0f) y = 1.0f;
|
|
Value *XEqOne = B.CreateFCmpOEQ(X, One);
|
|
Y = B.CreateSelect(XEqOne, One, Y);
|
|
|
|
// if (y == 0.0f) x = 1.0f;
|
|
Value *YEqZero = B.CreateFCmpOEQ(Y, ConstantFP::getZero(X->getType()));
|
|
X = B.CreateSelect(YEqZero, One, X);
|
|
|
|
Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
|
|
Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
|
|
replaceCall(FPOp, Fixed);
|
|
return true;
|
|
}
|
|
case PowKind::PowR: {
|
|
Value *NegX = B.CreateFCmpOLT(X, ConstantFP::getZero(X->getType()));
|
|
X = B.CreateSelect(NegX, ConstantFP::getQNaN(X->getType()), X);
|
|
|
|
Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
|
|
Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
|
|
replaceCall(FPOp, Fixed);
|
|
return true;
|
|
}
|
|
case PowKind::PowN: {
|
|
// ny == 0
|
|
Value *YEqZero = B.CreateICmpEQ(Y, ConstantInt::get(Y->getType(), 0));
|
|
|
|
// x = (ny == 0 ? 1.0f : x)
|
|
X = B.CreateSelect(YEqZero, ConstantFP::get(X->getType(), 1.0), X);
|
|
|
|
Value *CastY = B.CreateSIToFP(Y, X->getType());
|
|
Value *ExpYLnX = emitFastExpYLnx(B, X, CastY);
|
|
Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
|
|
replaceCall(FPOp, Fixed);
|
|
return true;
|
|
}
|
|
case PowKind::RootN: {
|
|
Value *CastY = B.CreateSIToFP(Y, X->getType());
|
|
Value *RcpY = B.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, CastY);
|
|
Value *ExpYLnX = emitFastExpYLnx(B, X, RcpY);
|
|
Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
|
|
replaceCall(FPOp, Fixed);
|
|
return true;
|
|
}
|
|
}
|
|
llvm_unreachable("Unhandled PowKind enum");
|
|
}
|
|
|
|
/// Try to optimize a pow call. In order, this attempts to:
///   1. retarget the call to powr when the base cannot be ordered less than
///      zero,
///   2. retarget the call to pown when the exponent is known to be integral,
///   3. fold the call with fold_pow,
///   4. for afn calls, retarget f32 pow to its fast library variant or expand
///      it inline with expandFastPow.
///
/// \returns true if the call was changed.
bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
                                    const FuncInfo &FInfo) {
  const FastMathFlags FMF = FPOp->getFastMathFlags();
  CallInst *Call = cast<CallInst>(FPOp);
  Module *M = Call->getModule();

  // A fast variant is requested for afn calls and for calls that already
  // target the fast pow function.
  const bool WantFastVariant =
      FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST;

  FuncInfo PowrInfo;
  const AMDGPULibFunc::EFuncId FastPowrId =
      WantFastVariant ? AMDGPULibFunc::EI_POWR_FAST : AMDGPULibFunc::EI_NONE;
  FunctionCallee PowrFunc = getFloatFastVariant(
      M, FInfo, PowrInfo, AMDGPULibFunc::EI_POWR, FastPowrId);

  // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.

  // pow(x, y) -> powr(x, y) for x >= -0.0
  // TODO: Account for flags on current call
  if (PowrFunc && cannotBeOrderedLessThanZero(FPOp->getOperand(0),
                                              SQ.getWithInstruction(Call))) {
    Call->setCalledFunction(PowrFunc);
    // The call has already been changed, so report success whether or not
    // the additional fold applies.
    (void)fold_pow(FPOp, B, PowrInfo);
    return true;
  }

  // pow(x, y) -> pown(x, y) for known integral y
  if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(Call),
                      FPOp->getFastMathFlags())) {
    FunctionType *PownType = getPownType(Call->getFunctionType());

    FuncInfo PownInfo;
    const AMDGPULibFunc::EFuncId FastPownId =
        WantFastVariant ? AMDGPULibFunc::EI_POWN_FAST : AMDGPULibFunc::EI_NONE;
    FunctionCallee PownFunc = getFloatFastVariant(
        M, FInfo, PownInfo, AMDGPULibFunc::EI_POWN, FastPownId);

    if (PownFunc) {
      // TODO: If the incoming integral value is an sitofp/uitofp, it won't
      // fold out without a known range. We can probably take the source
      // value directly.
      Value *IntExponent =
          B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));

      // Have to drop any nofpclass attributes on the original call site.
      Call->removeParamAttrs(
          1, AttributeFuncs::typeIncompatible(IntExponent->getType(),
                                              Call->getParamAttributes(1)));
      Call->setCalledFunction(PownFunc);
      Call->setArgOperand(1, IntExponent);
      (void)fold_pow(FPOp, B, PownInfo);
      return true;
    }
  }

  if (fold_pow(FPOp, B, FInfo))
    return true;

  // Everything below requires the afn flag.
  if (!FMF.approxFunc())
    return false;

  // afn is known to be set here, so only the function id and the argument
  // type remain to be checked before trying the fast f32 library variant.
  const bool IsF32Pow = FInfo.getId() == AMDGPULibFunc::EI_POW &&
                        getArgType(FInfo) == AMDGPULibFunc::F32;
  if (IsF32Pow) {
    AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
    if (FunctionCallee PowFastFunc = getFunction(M, PowFastInfo)) {
      Call->setCalledFunction(PowFastFunc);
      (void)fold_pow(FPOp, B, PowFastInfo);
      return true;
    }
  }

  return expandFastPow(FPOp, B, PowKind::Pow);
}
|
|
|
|
// Get a scalar native builtin single argument FP function
|
|
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
|
|
const FuncInfo &FInfo) {
|
|
if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
|
|
return nullptr;
|
|
FuncInfo nf = FInfo;
|
|
nf.setPrefix(AMDGPULibFunc::NATIVE);
|
|
return getFunction(M, nf);
|
|
}
|
|
|
|
// Some library calls are just wrappers around llvm intrinsics, but compiled
|
|
// conservatively. Preserve the flags from the original call site by
|
|
// substituting them with direct calls with all the flags.
|
|
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
|
|
bool AllowMinSizeF32,
|
|
bool AllowF64,
|
|
bool AllowStrictFP) {
|
|
Type *FltTy = CI->getType()->getScalarType();
|
|
const bool IsF32 = FltTy->isFloatTy();
|
|
|
|
// f64 intrinsics aren't implemented for most operations.
|
|
if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
|
|
return false;
|
|
|
|
// We're implicitly inlining by replacing the libcall with the intrinsic, so
|
|
// don't do it for noinline call sites.
|
|
if (CI->isNoInline())
|
|
return false;
|
|
|
|
const Function *ParentF = CI->getFunction();
|
|
// TODO: Handle strictfp
|
|
if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
|
|
return false;
|
|
|
|
if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/// Retarget \p CI in place to the intrinsic \p IntrID, overloaded on the
/// call's result type. For two-argument calls where exactly one argument is a
/// vector, the scalar argument is first splatted to the other argument's
/// element count.
void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
    auto *LHSVecTy = dyn_cast<VectorType>(CI->getArgOperand(0)->getType());
    auto *RHSVecTy = dyn_cast<VectorType>(CI->getArgOperand(1)->getType());

    // Mixed scalar/vector operands: widen the scalar one.
    const bool HasMixedOperands = (LHSVecTy != nullptr) != (RHSVecTy != nullptr);
    if (HasMixedOperands) {
      const unsigned ScalarIdx = LHSVecTy ? 1 : 0;
      VectorType *VecTy = LHSVecTy ? LHSVecTy : RHSVecTy;
      Value *Splat = B.CreateVectorSplat(VecTy->getElementCount(),
                                         CI->getArgOperand(ScalarIdx));
      CI->setArgOperand(ScalarIdx, Splat);
    }
  }

  Function *IntrinsicDecl = Intrinsic::getOrInsertDeclaration(
      CI->getModule(), IntrID, {CI->getType()});
  CI->setCalledFunction(IntrinsicDecl);
}
|
|
|
|
/// Replace \p CI with the intrinsic \p IntrID if
/// shouldReplaceLibcallWithIntrinsic permits it for the given options.
/// \returns true if the call was retargeted.
bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  const bool CanReplace = shouldReplaceLibcallWithIntrinsic(
      CI, AllowMinSizeF32, AllowF64, AllowStrictFP);
  if (CanReplace)
    replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return CanReplace;
}
|
|
|
|
/// Emit a call to \p Fsincos on \p Arg together with the stack slot that
/// receives the cosine.
///
/// The slot is allocated after the existing allocas of the current function.
/// The call is placed right after \p Arg when \p Arg is an instruction, and
/// otherwise right after the allocas. \p FMF is not used by this function
/// body.
///
/// \returns a tuple of {sincos call, load of the cos slot, sincos call}.
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  // Remember the debug location; moving the insert point overwrites it.
  DebugLoc SavedLoc = B.getCurrentDebugLocation();
  Function *CurFn = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(CurFn);

  AllocaInst *CosSlot = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  auto *ArgInst = dyn_cast<Instruction>(Arg);
  if (ArgInst) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.
    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(SavedLoc);
  }

  // The allocaInst allocates the memory in private address space. This need
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);
  Value *CosPtr = B.CreateAddrSpaceCast(CosSlot, CosPtrTy);

  CallInst *SinCosCall = CreateCallEx2(B, Fsincos, Arg, CosPtr);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?
  LoadInst *CosValue = B.CreateLoad(Arg->getType(), CosSlot);
  return {SinCosCall, CosValue, SinCosCall};
}
|
|
|
|
// fold sin, cos -> sincos.
|
|
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
|
|
const FuncInfo &fInfo) {
|
|
assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
|
|
fInfo.getId() == AMDGPULibFunc::EI_COS);
|
|
|
|
if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
|
|
getArgType(fInfo) != AMDGPULibFunc::F64) ||
|
|
fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
|
|
return false;
|
|
|
|
bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
|
|
|
|
Value *CArgVal = FPOp->getOperand(0);
|
|
|
|
// TODO: Constant fold the call
|
|
if (isa<ConstantData>(CArgVal))
|
|
return false;
|
|
|
|
CallInst *CI = cast<CallInst>(FPOp);
|
|
|
|
Function *F = B.GetInsertBlock()->getParent();
|
|
Module *M = F->getParent();
|
|
|
|
// Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
|
|
// implementation. Prefer the private form if available.
|
|
AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
|
|
SinCosLibFuncPrivate.getLeads()[0].PtrKind =
|
|
AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);
|
|
|
|
AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
|
|
SinCosLibFuncGeneric.getLeads()[0].PtrKind =
|
|
AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
|
|
|
|
FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
|
|
FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
|
|
FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
|
|
if (!FSinCos)
|
|
return false;
|
|
|
|
SmallVector<CallInst *> SinCalls;
|
|
SmallVector<CallInst *> CosCalls;
|
|
SmallVector<CallInst *> SinCosCalls;
|
|
FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
|
|
fInfo);
|
|
const std::string PairName = PartnerInfo.mangle();
|
|
|
|
StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
|
|
StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
|
|
const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
|
|
const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
|
|
|
|
// Intersect the two sets of flags.
|
|
FastMathFlags FMF = FPOp->getFastMathFlags();
|
|
MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);
|
|
|
|
SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
|
|
|
|
for (User* U : CArgVal->users()) {
|
|
CallInst *XI = dyn_cast<CallInst>(U);
|
|
if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
|
|
continue;
|
|
|
|
Function *UCallee = XI->getCalledFunction();
|
|
if (!UCallee)
|
|
continue;
|
|
|
|
bool Handled = true;
|
|
|
|
if (UCallee->getName() == SinName)
|
|
SinCalls.push_back(XI);
|
|
else if (UCallee->getName() == CosName)
|
|
CosCalls.push_back(XI);
|
|
else if (UCallee->getName() == SinCosPrivateName ||
|
|
UCallee->getName() == SinCosGenericName)
|
|
SinCosCalls.push_back(XI);
|
|
else
|
|
Handled = false;
|
|
|
|
if (Handled) {
|
|
MergeDbgLocs.push_back(XI->getDebugLoc());
|
|
auto *OtherOp = cast<FPMathOperator>(XI);
|
|
FMF &= OtherOp->getFastMathFlags();
|
|
FPMath = MDNode::getMostGenericFPMath(
|
|
FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
|
|
}
|
|
}
|
|
|
|
if (SinCalls.empty() || CosCalls.empty())
|
|
return false;
|
|
|
|
B.setFastMathFlags(FMF);
|
|
B.setDefaultFPMathTag(FPMath);
|
|
DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
|
|
B.SetCurrentDebugLocation(DbgLoc);
|
|
|
|
auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);
|
|
|
|
auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
|
|
for (CallInst *C : Calls)
|
|
C->replaceAllUsesWith(Res);
|
|
|
|
// Leave the other dead instructions to avoid clobbering iterators.
|
|
};
|
|
|
|
replaceTrigInsts(SinCalls, Sin);
|
|
replaceTrigInsts(CosCalls, Cos);
|
|
replaceTrigInsts(SinCosCalls, SinCos);
|
|
|
|
// It's safe to delete the original now.
|
|
CI->eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
|
|
APFloat &Res0, APFloat &Res1,
|
|
Constant *copr0, Constant *copr1) {
|
|
// By default, opr0/opr1/opr3 holds values of float/double type.
|
|
// If they are not float/double, each function has to its
|
|
// operand separately.
|
|
double opr0 = 0.0, opr1 = 0.0;
|
|
ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
|
|
ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
|
|
if (fpopr0) {
|
|
opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
|
|
? fpopr0->getValueAPF().convertToDouble()
|
|
: (double)fpopr0->getValueAPF().convertToFloat();
|
|
}
|
|
|
|
if (fpopr1) {
|
|
opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
|
|
? fpopr1->getValueAPF().convertToDouble()
|
|
: (double)fpopr1->getValueAPF().convertToFloat();
|
|
}
|
|
|
|
switch (FInfo.getId()) {
|
|
default:
|
|
return false;
|
|
|
|
case AMDGPULibFunc::EI_ACOS:
|
|
Res0 = APFloat{acos(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ACOSH:
|
|
// acosh(x) == log(x + sqrt(x*x - 1))
|
|
Res0 = APFloat{log(opr0 + sqrt(opr0 * opr0 - 1.0))};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ACOSPI:
|
|
Res0 = APFloat{acos(opr0) / MATH_PI};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ASIN:
|
|
Res0 = APFloat{asin(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ASINH:
|
|
// asinh(x) == log(x + sqrt(x*x + 1))
|
|
Res0 = APFloat{log(opr0 + sqrt(opr0 * opr0 + 1.0))};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ASINPI:
|
|
Res0 = APFloat{asin(opr0) / MATH_PI};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ATAN:
|
|
Res0 = APFloat{atan(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ATANH:
|
|
// atanh(x) == (log(x+1) - log(x-1))/2;
|
|
Res0 = APFloat{(log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_ATANPI:
|
|
Res0 = APFloat{atan(opr0) / MATH_PI};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_CBRT:
|
|
Res0 =
|
|
APFloat{(opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_COS:
|
|
Res0 = APFloat{cos(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_COSH:
|
|
Res0 = APFloat{cosh(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_COSPI:
|
|
Res0 = APFloat{cos(MATH_PI * opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_EXP:
|
|
Res0 = APFloat{exp(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_EXP2:
|
|
Res0 = APFloat{pow(2.0, opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_EXP10:
|
|
Res0 = APFloat{pow(10.0, opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_LOG:
|
|
Res0 = APFloat{log(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_LOG2:
|
|
Res0 = APFloat{log(opr0) / log(2.0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_LOG10:
|
|
Res0 = APFloat{log(opr0) / log(10.0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_RSQRT:
|
|
Res0 = APFloat{1.0 / sqrt(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_SIN:
|
|
Res0 = APFloat{sin(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_SINH:
|
|
Res0 = APFloat{sinh(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_SINPI:
|
|
Res0 = APFloat{sin(MATH_PI * opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_TAN:
|
|
Res0 = APFloat{tan(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_TANH:
|
|
Res0 = APFloat{tanh(opr0)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_TANPI:
|
|
Res0 = APFloat{tan(MATH_PI * opr0)};
|
|
return true;
|
|
|
|
// two-arg functions
|
|
case AMDGPULibFunc::EI_POW:
|
|
case AMDGPULibFunc::EI_POWR:
|
|
Res0 = APFloat{pow(opr0, opr1)};
|
|
return true;
|
|
|
|
case AMDGPULibFunc::EI_POWN: {
|
|
if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
|
|
double val = (double)iopr1->getSExtValue();
|
|
Res0 = APFloat{pow(opr0, val)};
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
case AMDGPULibFunc::EI_ROOTN: {
|
|
if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
|
|
double val = (double)iopr1->getSExtValue();
|
|
Res0 = APFloat{pow(opr0, 1.0 / val)};
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// with ptr arg
|
|
case AMDGPULibFunc::EI_SINCOS:
|
|
Res0 = APFloat{sin(opr0)};
|
|
Res1 = APFloat{cos(opr0)};
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Try to replace a library call on constant operands with a constant.
///
/// Each scalar lane is evaluated with evaluateScalarMathFunc(). For sincos
/// the cosine is additionally stored through the call's second argument.
///
/// \param aCI    The call to fold. It is replaced via replaceCall() on
///               success and left untouched on failure.
/// \param FInfo  Describes the callee (function id and vector size).
/// \returns true if the call was replaced, false otherwise.
bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  const int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    copr0 = dyn_cast<Constant>(aCI->getArgOperand(0));
    if (!copr0)
      return false;
  }

  if (numArgs > 1) {
    copr1 = dyn_cast<Constant>(aCI->getArgOperand(1));
    // sincos is the one exception: its second argument is the pointer the
    // cosine is stored through, so it does not have to be a constant.
    if (!copr1 && FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
      return false;
  }

  // max vector size is 16, and sincos will generate two results.
  SmallVector<APFloat, 16> Val0, Val1;
  const int FuncVecSize = getVecSize(FInfo);
  const bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);

  // Evaluate one lane. The first operand has to be a real floating-point
  // constant; anything else (undef, constant expression, missing element)
  // must not be folded as if it were 0.0.
  auto EvaluateLane = [&](Constant *Op0, Constant *Op1) -> bool {
    if (!Op0 || !isa<ConstantFP>(Op0))
      return false;
    return evaluateScalarMathFunc(FInfo, Val0.emplace_back(0.0),
                                  Val1.emplace_back(0.0), Op0, Op1);
  };

  if (FuncVecSize == 1) {
    if (!EvaluateLane(copr0, copr1))
      return false;
  } else {
    // Accept every kind of constant vector (ConstantDataVector,
    // ConstantVector, zeroinitializer, ...) by going through
    // getAggregateElement(). Operands that are not vectors, such as the
    // sincos pointer, contribute no per-lane element.
    const bool IsVec0 = copr0 && copr0->getType()->isVectorTy();
    const bool IsVec1 = copr1 && copr1->getType()->isVectorTy();
    for (unsigned i = 0, e = static_cast<unsigned>(FuncVecSize); i < e; ++i) {
      Constant *celt0 = IsVec0 ? copr0->getAggregateElement(i) : nullptr;
      Constant *celt1 = IsVec1 ? copr1->getAggregateElement(i) : nullptr;
      if (!EvaluateLane(celt0, celt1))
        return false;
    }
  }

  Type *RetTy = aCI->getType();

  // The folded values are built from host doubles. Round them to the FP
  // format of the call's type before creating the scalar constant, so the
  // APFloat's semantics agree with the type handed to ConstantFP::get().
  auto MakeScalar = [RetTy](APFloat V) -> Constant * {
    bool LosesInfo = false;
    V.convert(RetTy->getFltSemantics(), APFloat::rmNearestTiesToEven,
              &LosesInfo);
    (void)LosesInfo;
    return ConstantFP::get(RetTy, V);
  };

  Constant *nval0 = nullptr;
  Constant *nval1 = nullptr;
  if (FuncVecSize == 1) {
    nval0 = MakeScalar(Val0[0]);
    if (hasTwoResults)
      nval1 = MakeScalar(Val1[0]);
  } else {
    nval0 = getConstantFloatVector(Val0, RetTy);
    if (hasTwoResults)
      nval1 = getConstantFloatVector(Val1, RetTy);
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
  }

  replaceCall(aCI, nval0);
  return true;
}
|
|
|
|
/// Run the library-call simplifier over every call instruction in \p F.
///
/// \returns PreservedAnalyses::none() if at least one call was folded,
///          PreservedAnalyses::all() otherwise.
PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  bool Changed = false;
  for (BasicBlock &Block : F) {
    BasicBlock::iterator It = Block.begin();
    const BasicBlock::iterator End = Block.end();
    while (It != End) {
      // Step past the instruction before handing it to the simplifier, so
      // the walk does not depend on that instruction staying in place.
      Instruction &Inst = *It++;

      // Only calls are of interest.
      if (auto *Call = dyn_cast<CallInst>(&Inst))
        Changed |= Simplifier.fold(Call);
    }
  }

  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
|
|
|
|
/// Apply AMDGPULibCalls::useNative() to every call instruction in \p F.
///
/// Does nothing when UseNative is empty.
///
/// \returns PreservedAnalyses::none() if at least one call was changed,
///          PreservedAnalyses::all() otherwise.
PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  bool Changed = false;
  for (BasicBlock &Block : F) {
    BasicBlock::iterator It = Block.begin();
    const BasicBlock::iterator End = Block.end();
    while (It != End) {
      // Step past the instruction first, so the walk does not depend on it
      // staying in place once it has been processed.
      Instruction &Inst = *It++;

      // Only calls are of interest.
      auto *Call = dyn_cast<CallInst>(&Inst);
      if (!Call)
        continue;
      if (Simplifier.useNative(Call))
        Changed = true;
    }
  }

  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
|