llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"

using namespace llvm;
using namespace llvm::PatternMatch;

// Hidden boolean option "amdgpu-prelink" (default: false). getFunction()
// consults it to decide whether a missing library function may be declared
// on demand or must already exist in the module.
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

// Hidden list option "amdgpu-use-native": comma-separated function names, or
// "all". The value is optional; initNativeFuncs() treats a single empty entry
// the same as "all".
static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

// Short names for the llvm::numbers constants used by the tbl_* lookup tables.
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

// Selects which member of the pow family expandFastPow() is expanding.
enum class PowKind { Pow, PowR, PowN, RootN };

namespace llvm {

/// Per-function helper that recognizes calls to AMDGPU device library
/// functions and tries to simplify them (see fold()) or redirect them to
/// native variants (see useNative()).
class AMDGPULibCalls {
private:
  // Analysis bundle assembled by the constructor.
  SimplifyQuery SQ;

  using FuncInfo = llvm::AMDGPULibFunc;

  // -fuse-native. Set by initNativeFuncs(); when true, useNativeFunc()
  // answers true for every name.
  bool AllNative = false;

  // Return true if \p F was requested through the amdgpu-use-native option
  // (or if AllNative is set).
  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if function definition with
  // "FuncName" exists. It may create a new function prototype in pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  /// Wrapper around getFunction which tries to use a faster variant if
  /// available, and falls back to a less fast option.
  ///
  /// Return a replacement function for \p fInfo that has float-typed fast
  /// variants. \p NewFunc is a base replacement function to use. \p
  /// NewFuncFastVariant is a faster version to use if the calling context knows
  /// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
  /// should be EI_NONE. \p newInfo receives the description of whichever
  /// function was looked up last.
  FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
                                     FuncInfo &newInfo,
                                     AMDGPULibFunc::EFuncId NewFunc,
                                     AMDGPULibFunc::EFuncId NewFuncFastVariant);

  // Thin wrapper over AMDGPULibFunc::parse(); fills \p FInfo and returns
  // whether the name was recognized.
  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  // Table-driven fold: replace a call whose constant argument matches a
  // known input with the tabulated result.
  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  /// Perform a fast math expansion of pow, powr, pown or rootn.
  bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);

  bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
                      const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // evaluate calls if calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
                              APFloat &Res1, Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
  /// of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Decide whether a call to a known libcall should be substituted with an
  /// intrinsic call. If \p AllowMinSizeF32 is true, allow the replacement in a
  /// minsize function. NOTE(review): \p AllowF64 and \p AllowStrictFP
  /// presumably lift the matching restrictions -- confirm against the
  /// definition, which is outside this part of the file.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  // True when the operation carries the afn, nnan and ninf flags together.
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  // True when the operation is marked fully "fast".
  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  // Redirect every use of \p I to \p With, then delete \p I. \p I must not be
  // touched afterwards.
  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  // Convenience overload for callers holding the call as an FPMathOperator.
  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }

public:
  AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);

  // Try to simplify the library call \p CI; returns true if the IR changed.
  bool fold(CallInst *CI);

  // Compute AllNative from the amdgpu-use-native option.
  void initNativeFuncs();

  // Replace a normal math function call with that native version
  bool useNative(CallInst *CI);
};

} // end namespace llvm

/// Emit a single-argument call to \p Callee through builder \p B. If the
/// callee is a Function, the new call takes over that function's calling
/// convention.
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *NewCall = B.CreateCall(Callee, Arg, Name);
  auto *Target = dyn_cast<Function>(Callee.getCallee());
  if (Target)
    NewCall->setCallingConv(Target->getCallingConv());
  return NewCall;
}

/// Two-argument counterpart of CreateCallEx: emit a call to \p Callee with
/// (\p Arg1, \p Arg2) and, when the callee is a Function, copy its calling
/// convention onto the call.
template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *NewCall = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  auto *Target = dyn_cast<Function>(Callee.getCallee());
  if (Target)
    NewCall->setCallingConv(Target->getCallingConv());
  return NewCall;
}

/// Derive a pown-style signature from \p FT: same return type and first
/// parameter, with an i32 second parameter (widened to a vector of i32 with
/// the same element count when the return type is a vector).
static FunctionType *getPownType(FunctionType *FT) {
  Type *RetTy = FT->getReturnType();

  Type *ExpTy = Type::getInt32Ty(FT->getContext());
  if (auto *RetVecTy = dyn_cast<VectorType>(RetTy))
    ExpTy = VectorType::get(ExpTy, RetVecTy->getElementCount());

  Type *Params[] = {FT->getParamType(0), ExpTy};
  return FunctionType::get(RetTy, Params, /*isVarArg=*/false);
}

//  Data structures for table-driven optimizations.
//  FuncTbl works for both f32 and f64 functions with 1 input argument

// One row of a table-driven fold. Rows are brace-initialized in the order
// {result, input}.
struct TableEntry {
  double   result; // Constant that replaces the call.
  double   input;  // Argument value that must match exactly.
};

/* a list of {result, input} */
// Per-function fold tables consumed through getOptTable(). Each row pairs an
// exact argument value (second field) with the constant the call is replaced
// by (first field). +0.0 and -0.0 are listed as separate rows.
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, 0.0},
  {MATH_PI / 2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 2.0, 1.0},
  {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 4.0, 1.0},
  {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
// Note: unlike most tables here, this one has no row for -0.0.
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
// Inputs 1..4 map to 1, 1, 2, 6.
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};

/// Report whether \p id belongs to the set of library functions that
/// useNative() is willing to redirect to a native-prefixed variant.
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  default:
    return false;

  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  }
}

using TableRef = ArrayRef<TableEntry>;

/// Map a library function id to its table of foldable {result, input} rows.
/// Native-prefixed ids (EI_N*) share the table of their plain counterpart.
/// Ids without a table yield an empty TableRef.
static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  case AMDGPULibFunc::EI_ACOS:
    return tbl_acos;
  case AMDGPULibFunc::EI_ACOSH:
    return tbl_acosh;
  case AMDGPULibFunc::EI_ACOSPI:
    return tbl_acospi;
  case AMDGPULibFunc::EI_ASIN:
    return tbl_asin;
  case AMDGPULibFunc::EI_ASINH:
    return tbl_asinh;
  case AMDGPULibFunc::EI_ASINPI:
    return tbl_asinpi;
  case AMDGPULibFunc::EI_ATAN:
    return tbl_atan;
  case AMDGPULibFunc::EI_ATANH:
    return tbl_atanh;
  case AMDGPULibFunc::EI_ATANPI:
    return tbl_atanpi;
  case AMDGPULibFunc::EI_CBRT:
    return tbl_cbrt;
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS:
    return tbl_cos;
  case AMDGPULibFunc::EI_COSH:
    return tbl_cosh;
  case AMDGPULibFunc::EI_COSPI:
    return tbl_cospi;
  case AMDGPULibFunc::EI_ERFC:
    return tbl_erfc;
  case AMDGPULibFunc::EI_ERF:
    return tbl_erf;
  case AMDGPULibFunc::EI_EXP:
    return tbl_exp;
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2:
    return tbl_exp2;
  case AMDGPULibFunc::EI_EXP10:
    return tbl_exp10;
  case AMDGPULibFunc::EI_EXPM1:
    return tbl_expm1;
  case AMDGPULibFunc::EI_LOG:
    return tbl_log;
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2:
    return tbl_log2;
  case AMDGPULibFunc::EI_LOG10:
    return tbl_log10;
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT:
    return tbl_rsqrt;
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN:
    return tbl_sin;
  case AMDGPULibFunc::EI_SINH:
    return tbl_sinh;
  case AMDGPULibFunc::EI_SINPI:
    return tbl_sinpi;
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT:
    return tbl_sqrt;
  case AMDGPULibFunc::EI_TAN:
    return tbl_tan;
  case AMDGPULibFunc::EI_TANH:
    return tbl_tanh;
  case AMDGPULibFunc::EI_TANPI:
    return tbl_tanpi;
  case AMDGPULibFunc::EI_TGAMMA:
    return tbl_tgamma;
  default:
    break;
  }
  return TableRef();
}

/// Vector size recorded for the first ("lead") argument of \p FInfo.
static inline int getVecSize(const AMDGPULibFunc& FInfo) {
  const auto &FirstLead = FInfo.getLeads()[0];
  return FirstLead.VectorSize;
}

/// Argument type recorded for the first ("lead") argument of \p FInfo,
/// converted to AMDGPULibFunc::EType.
static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
  const auto &FirstLead = FInfo.getLeads()[0];
  return static_cast<AMDGPULibFunc::EType>(FirstLead.ArgType);
}

/// Look up the library function described by \p fInfo in \p M. With
/// -amdgpu-prelink the function is treated as external, so a declaration is
/// inserted when missing; otherwise only a lookup is performed.
FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // In pre-link mode the function is external, so getOrInsertFunction() is
  // safe to use at this stage.
  if (EnablePreLink)
    return AMDGPULibFunc::getOrInsertFunction(M, fInfo);

  return AMDGPULibFunc::getFunction(M, fInfo);
}

/// Pick a replacement callee for \p fInfo. For f32 calls with a real
/// \p FastVariant, the fast variant is tried first; if it cannot be obtained
/// (or does not apply) the lookup falls back to \p NewFunc. \p newInfo is
/// overwritten with the description of the last function looked up.
FunctionCallee AMDGPULibCalls::getFloatFastVariant(
    Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
    AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
  assert(NewFunc != FastVariant);

  const bool TryFast = FastVariant != AMDGPULibFunc::EI_NONE &&
                       getArgType(fInfo) == AMDGPULibFunc::F32;
  if (TryFast) {
    newInfo = AMDGPULibFunc(FastVariant, fInfo);
    FunctionCallee FastCallee = getFunction(M, newInfo);
    if (FastCallee)
      return FastCallee;
  }

  // No usable fast variant: use the base replacement.
  newInfo = AMDGPULibFunc(NewFunc, fInfo);
  return getFunction(M, newInfo);
}

/// Forward \p FMangledName to AMDGPULibFunc::parse(), which fills \p FInfo,
/// and hand back its result.
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  const bool Recognized = AMDGPULibFunc::parse(FMangledName, FInfo);
  return Recognized;
}

/// True only when \p FPOp carries all three of: approximate functions,
/// no-NaNs and no-infs.
bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  if (!FPOp->hasApproxFunc())
    return false;
  if (!FPOp->hasNoNaNs())
    return false;
  return FPOp->hasNoInfs();
}

/// Constant folding at higher precision is currently permitted only for
/// operations marked fully "fast".
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  const bool IsFullyFast = FPOp->isFast();
  return IsFullyFast;
}

/// Build the SimplifyQuery for \p F from the parent module's data layout and
/// the function's TargetLibraryAnalysis and AssumptionAnalysis results. The
/// dominator tree is taken via getCachedResult rather than getResult.
/// NOTE(review): presumably it is null when not already computed -- confirm.
AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
    : SQ(F.getParent()->getDataLayout(),
         &FAM.getResult<TargetLibraryAnalysis>(F),
         FAM.getCachedResult<DominatorTreeAnalysis>(F),
         &FAM.getResult<AssumptionAnalysis>(F)) {}

/// Whether the native variant was requested for function name \p F: always
/// true once AllNative is set, otherwise true if \p F appears in the
/// amdgpu-use-native list.
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  if (AllNative)
    return true;
  return llvm::is_contained(UseNative, F);
}

/// Compute AllNative. It becomes true when "all" is requested, or when the
/// amdgpu-use-native option was given with a single empty entry.
void AMDGPULibCalls::initNativeFuncs() {
  if (useNativeFunc("all")) {
    AllNative = true;
    return;
  }

  // The option occurred but holds exactly one empty name.
  const bool GivenWithoutNames = UseNative.getNumOccurrences() &&
                                 UseNative.size() == 1 &&
                                 UseNative.begin()->empty();
  AllNative = GivenWithoutNames;
}

/// Split a sincos call into separate native sin and native cos calls when
/// native variants were requested for both. The cos value is stored through
/// the call's second operand and the sin value replaces the call itself.
/// Returns true if the call was replaced (\p aCI is erased in that case).
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  const bool WantNativeSin = useNativeFunc("sin");
  const bool WantNativeCos = useNativeFunc("cos");
  if (!WantNativeSin || !WantNativeCos)
    return false;

  Module *Mod = aCI->getModule();
  Value *Angle = aCI->getArgOperand(0);

  // Describe the native function with the same lead argument shape.
  AMDGPULibFunc NativeInfo;
  NativeInfo.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
  NativeInfo.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
  NativeInfo.setId(AMDGPULibFunc::EI_SIN);
  FunctionCallee NativeSin = getFunction(Mod, NativeInfo);

  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
  NativeInfo.setId(AMDGPULibFunc::EI_COS);
  FunctionCallee NativeCos = getFunction(Mod, NativeInfo);

  if (!NativeSin || !NativeCos)
    return false;

  // All new instructions are inserted right before the original call.
  Value *SinResult =
      CallInst::Create(NativeSin, Angle, "splitsin", aCI->getIterator());
  Value *CosResult =
      CallInst::Create(NativeCos, Angle, "splitcos", aCI->getIterator());
  new StoreInst(CosResult, aCI->getArgOperand(1), aCI->getIterator());

  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version of sin/cos");

  replaceCall(aCI, SinResult);
  return true;
}

/// Redirect a recognized, mangled, unprefixed, non-f64 library call to its
/// native variant when that was requested via amdgpu-use-native. sincos is
/// delegated to sincosUseNative(). Returns true if the call was changed.
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Target = aCI->getCalledFunction();
  if (!Target)
    return false;
  if (aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Target->getName(), FInfo))
    return false;
  if (!FInfo.isMangled())
    return false;
  if (FInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;
  if (getArgType(FInfo) == AMDGPULibFunc::F64)
    return false;
  if (!HasNative(FInfo.getId()))
    return false;
  if (!AllNative && !useNativeFunc(FInfo.getName()))
    return false;

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  // Same function, native prefix.
  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee NativeCallee = getFunction(aCI->getModule(), FInfo);
  if (!NativeCallee)
    return false;

  aCI->setCalledFunction(NativeCallee);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}

// Clang emits a call of __read_pipe_2 or __read_pipe_4 for the OpenCL
// read_pipe builtin, with appended type size and alignment arguments, where 2
// or 4 indicates the original number of arguments. The library has optimized
// versions of __read_pipe_2/__read_pipe_4 for when the type size and alignment
// have the same power of 2 value. This function transforms __read_pipe_2 to
// __read_pipe_2_N for such cases, where N is the size in bytes of the type
// (N = 1, 2, 4, 8, ..., 128). The same applies to __read_pipe_4,
// __write_pipe_2, and __write_pipe_4.
//
// The new call keeps every operand up to and including the pointer operand
// and drops the trailing size/alignment operands. Returns true if the call
// was replaced (CI is erased in that case), false if nothing changed.
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  const unsigned NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;

  auto *PacketSize = dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  auto *PacketAlign = dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  // getAlignValue() asserts on a non-zero alignment that is not a power of
  // two, so reject such a constant before converting it.
  const APInt &RawAlign = PacketAlign->getValue();
  if (!RawAlign.isZero() && !RawAlign.isPowerOf2())
    return false;

  // Comparing an Align against a scalar requires a positive scalar, so a zero
  // packet size must be filtered out first. Keep the full 64-bit value so the
  // comparison is not made against a truncated size.
  const uint64_t Size = PacketSize->getZExtValue();
  if (Size == 0)
    return false;

  const Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  // The pointer operand sits just before the size/alignment pair.
  const unsigned PtrArgLoc = NumArg - 3;

  // Operands (and their types) 0..PtrArgLoc are carried over unchanged.
  SmallVector<llvm::Type *, 6> ArgTys;
  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I <= PtrArgLoc; ++I) {
    Value *Operand = CI->getArgOperand(I);
    ArgTys.push_back(Operand->getType());
    Args.push_back(Operand);
  }

  std::string Name = std::string(Callee->getName());
  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}

// Try to simplify the library call CI. Returns false if nothing changed and
// true otherwise.
//
// Order of attempts: reject calls that are indirect, intrinsic, nobuiltin,
// unrecognized or signature-incompatible; then the table-driven fold; then,
// for floating-point calls, constant evaluation (fully "fast" calls only)
// followed by a per-function switch; for other calls, the pipe builtins.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(*Callee->getParent(), CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  if (TDOFold(CI, FInfo))
    return true;

  // New instructions are inserted at CI; a strictfp call switches the builder
  // to constrained FP mode.
  IRBuilder<> B(CI);
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    // exp/exp2/log/log2/log10: left alone when the call has no fast-math
    // flags at all; otherwise the afn flag is passed as AllowMinSizeF32.
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    // The functions below map onto same-named generic intrinsics. The
    // trailing booleans are AllowMinSizeF32, AllowF64 and (for fabs and
    // copysign only) AllowStrictFP.
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
        return false;

      // A vector result with a non-vector second operand: splat that operand
      // to the result's element count before retargeting the call.
      Value *Arg1 = CI->getArgOperand(1);
      if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
          VecTy && !isa<VectorType>(Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
        CI->setArgOperand(1, SplatArg1);
      }

      // The call is retargeted in place; the intrinsic is overloaded on the
      // result type and the (possibly splatted) second operand's type.
      CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
          CI->getModule(), Intrinsic::ldexp,
          {CI->getType(), CI->getArgOperand(1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW:
    case AMDGPULibFunc::EI_POW_FAST:
      return tryOptimizePow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWR_FAST: {
      if (fold_pow(FPOp, B, FInfo))
        return true;
      // Everything past this point requires the afn flag.
      if (!FMF.approxFunc())
        return false;

      // Plain f32 powr: retarget to the _FAST variant if it can be obtained.
      if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
          getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
        if (FunctionCallee PowrFastFunc = getFunction(M, PowrFastInfo)) {
          CI->setCalledFunction(PowrFastFunc);
          return true;
        }
      }

      if (!shouldReplaceLibcallWithIntrinsic(CI))
        return false;
      return expandFastPow(FPOp, B, PowKind::PowR);
    }
    case AMDGPULibFunc::EI_POWN:
    case AMDGPULibFunc::EI_POWN_FAST: {
      if (fold_pow(FPOp, B, FInfo))
        return true;
      if (!FMF.approxFunc())
        return false;

      // Plain f32 pown: retarget to the _FAST variant if it can be obtained.
      if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
          getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
        if (FunctionCallee PownFastFunc = getFunction(M, PownFastInfo)) {
          CI->setCalledFunction(PownFastFunc);
          return true;
        }
      }

      if (!shouldReplaceLibcallWithIntrinsic(CI))
        return false;
      return expandFastPow(FPOp, B, PowKind::PowN);
    }
    case AMDGPULibFunc::EI_ROOTN:
    case AMDGPULibFunc::EI_ROOTN_FAST: {
      if (fold_rootn(FPOp, B, FInfo))
        return true;
      if (!FMF.approxFunc())
        return false;

      // NOTE(review): unlike the powr/pown cases, this retargeting is not
      // restricted to the non-_FAST id, and expandFastPow below is not
      // guarded by shouldReplaceLibcallWithIntrinsic -- confirm intended.
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
        if (FunctionCallee RootnFastFunc = getFunction(M, RootnFastInfo)) {
          CI->setCalledFunction(RootnFastFunc);
          return true;
        }
      }

      return expandFastPow(FPOp, B, PowKind::RootN);
    }
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}

/// Build a constant vector from \p Values. Each value is first converted
/// (round-to-nearest-even) to the floating-point semantics of the scalar type
/// of \p Ty.
static Constant *getConstantFloatVector(const ArrayRef<APFloat> Values,
                                        const Type *Ty) {
  Type *ScalarTy = Ty->getScalarType();
  const fltSemantics &TargetSem = ScalarTy->getFltSemantics();

  SmallVector<Constant *, 4> Elements;
  Elements.reserve(Values.size());
  for (const APFloat &Source : Values) {
    // convert() mutates in place, so work on a private copy.
    APFloat Converted = Source;
    bool LosesInfo;
    Converted.convert(TargetSem, APFloat::rmNearestTiesToEven, &LosesInfo);
    Elements.push_back(ConstantFP::get(ScalarTy, Converted));
  }
  return ConstantVector::get(Elements);
}

/// Table-driven fold. If the first operand of \p CI is a constant whose value
/// (every element, for vectors) exactly matches an input listed in the table
/// for this function, replace the call with the tabulated result.
///
/// Returns true if the call was replaced (\p CI is erased in that case) and
/// false otherwise. Vector constants that contain anything other than plain
/// floating-point constants (for example undef or poison elements) are left
/// untouched.
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  Value *opr0 = CI->getArgOperand(0);

  // First table row whose input equals C exactly, or nullptr if none.
  auto findRow = [tr](const ConstantFP *C) -> const TableEntry * {
    auto It = llvm::find_if(tr, [C](const TableEntry &entry) {
      return C->isExactlyValue(entry.input);
    });
    return It == tr.end() ? nullptr : &*It;
  };

  int vecSize = getVecSize(FInfo);
  if (vecSize > 1) {
    // Vector version
    Constant *CV = dyn_cast<Constant>(opr0);
    if (!CV || !CV->getType()->isVectorTy())
      return false;

    SmallVector<APFloat, 4> Values;
    Values.reserve(vecSize);
    for (int eltNo = 0; eltNo < vecSize; ++eltNo) {
      // getAggregateElement may return null or a non-ConstantFP constant
      // (undef, poison, constant expression); give up instead of asserting.
      auto *eltval = dyn_cast_or_null<ConstantFP>(
          CV->getAggregateElement((unsigned)eltNo));
      if (!eltval)
        return false;

      const TableEntry *MatchingRow = findRow(eltval);
      if (!MatchingRow)
        return false;
      Values.push_back(APFloat(MatchingRow->result));
    }

    Constant *NewValues = getConstantFloatVector(Values, CI->getType());
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
    replaceCall(CI, NewValues);
    return true;
  }

  // Scalar version
  ConstantFP *CF = dyn_cast<ConstantFP>(opr0);
  if (!CF)
    return false;

  const TableEntry *MatchingRow = findRow(CF);
  if (!MatchingRow)
    return false;

  Value *nval = ConstantFP::get(CF->getType(), MatchingRow->result);
  LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
  replaceCall(CI, nval);
  return true;
}

namespace llvm {
/// Base-2 logarithm of \p V. Uses the C library's log2 when the feature-test
/// macros indicate it is available, and otherwise computes log(V) / ln(2).
static double log2(double V) {
#if !(_XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L)
  return log(V) / numbers::ln2;
#else
  return ::log2(V);
#endif
}
} // namespace llvm

/// Simplify a pow/powr/pown call (or one of their "_fast" variants).
///
/// Constant exponents 0, 1, 2, -1 and +/-0.5 are folded first. Every later
/// rewrite only runs when isUnsafeFiniteOnlyMath(FPOp) holds: a small constant
/// integral exponent becomes a chain of multiplies (plus a reciprocal when it
/// is negative), and anything else becomes exp2(y * log2(x)), with the sign of
/// the result patched back in for pow/pown.
///
/// \param FPOp  The call being simplified.
/// \param B     Builder used to emit the replacement instructions.
/// \param FInfo Parsed description of the callee.
/// \returns true if \p FPOp was replaced, false if nothing was replaced.
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
         "fold_pow: encounter a wrong function call");

  // Stored in ci_opr1 when the exponent is not a usable constant integer. It
  // is larger than every exponent handled below, so it means "do nothing".
  const int NoIntegralExponent = 0x1111111;
  // Largest |exponent| that is expanded into a chain of multiplies.
  const int MaxMulChainExponent = 12;

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  // The exponent is either an FP constant (pow/powr) or an integer constant
  // (pown); at most one of CF / CINT is set.
  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(opr1, m_APFloatAllowPoison(CF)))
    match(opr1, m_APIntAllowPoison(CINT));

  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : NoIntegralExponent);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    //  pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    replaceCall(FPOp, cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    replaceCall(FPOp, nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    replaceCall(FPOp, nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    // pow[r](x, 0.5) = sqrt(x), pow[r](x, -0.5) = rsqrt(x)
    bool issqrt = CF->isExactlyValue(0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                : AMDGPULibFunc::EI_RSQRT,
                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                        << (issqrt ? "sqrt" : "rsqrt") << '(' << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
                                                        : "__pow2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }

  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    // Only exponents with |c| <= MaxMulChainExponent are used below, so check
    // the range on the double first: converting an out-of-range or NaN double
    // to int is undefined behavior.
    const bool IsSmallIntegral =
        std::abs(dval) <= MaxMulChainExponent && std::trunc(dval) == dval;
    ci_opr1 = IsSmallIntegral ? (int)dval : NoIntegralExponent;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  //   trunc(c) == c && the number of x == c && |c| <= 12
  // Negate in unsigned arithmetic so that INT_MIN does not overflow.
  unsigned abs_opr1 =
      (ci_opr1 < 0) ? 0u - (unsigned)ci_opr1 : (unsigned)ci_opr1;
  if (abs_opr1 <= (unsigned)MaxMulChainExponent) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
      // Square-and-multiply: valx2 walks x, x^2, x^4, ... and nval accumulates
      // the factors selected by the set bits of the exponent.
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2,
                                                {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  // Work out what is needed around the base x:
  //   needlog      - log2 must be emitted (x is not a constant)
  //   needabs      - fabs(x) must be emitted before the log2
  //   needcopysign - the sign of the result must be patched afterwards
  //   cnval        - log2(|x|) folded at compile time for constant x
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowPoison(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                      FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                               FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                               FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
    } else {
      assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
              "Wrong vector size detected");

      // Build the folded log2 values with the call's own element type, so the
      // constant always has the same type as the exponent it is multiplied
      // with below (a vector of double would be wrong for a half vector).
      SmallVector<Constant *, 4> LogVals;
      LogVals.reserve(getVecSize(FInfo));
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0)
          needcopysign = true;
        LogVals.push_back(ConstantFP::get(eltType, log2(std::abs(V))));
      }
      cnval = ConstantVector::get(LogVals);
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
                       FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)),
                         FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2,
                                                  {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B,LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
      FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");

  CallInst *Exp2Call = CreateCallEx(B, ExpExpr, nval, "__exp2");

  // TODO: Generalized fpclass logic for pow
  FPClassTest KnownNot = FPClassTest::fcNegative;
  if (FPOp->hasNoNaNs())
    KnownNot |= FPClassTest::fcNan;

  Exp2Call->addRetAttr(
      Attribute::getWithNoFPClass(Exp2Call->getContext(), KnownNot));
  nval = Exp2Call;

  if (needcopysign) {
    // Move the low bit of the integer exponent into the sign-bit position and
    // AND it with the bits of x; the result supplies the sign for copysign.
    Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(nTyS);
    Value *opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    unsigned size = nTy->getScalarSizeInBits();
    Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");

    nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
                            nullptr, "__pow_sign");
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}

/// Simplify rootn(x, n) when n is one of a few constant integers:
///   n ==  1 -> x             (not in strictfp functions)
///   n ==  2 -> sqrt(x)       (when the intrinsic replacement is allowed)
///   n ==  3 -> cbrt(x)       (when the cbrt library function is available)
///   n == -1 -> 1.0 / x
///   n == -2 -> 1.0 / sqrt(x) (when the intrinsic replacement is allowed)
///
/// \returns true if \p FPOp was replaced.
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *Radicand = FPOp->getOperand(0);

  const APInt *RootC = nullptr;
  if (!match(FPOp->getOperand(1), m_APIntAllowPoison(RootC)))
    return false;

  BasicBlock *InsertBB = B.GetInsertBlock();
  const int Root = (int)RootC->getSExtValue();

  // rootn(x, 1) = x
  //
  // TODO: Insert constrained canonicalize for strictfp case.
  if (Root == 1 &&
      !InsertBB->getParent()->hasFnAttribute(Attribute::StrictFP)) {
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *Radicand << '\n');
    replaceCall(FPOp, Radicand);
    return true;
  }

  Module *M = InsertBB->getModule();
  CallInst *CI = cast<CallInst>(FPOp);

  switch (Root) {
  case 2: {
    // rootn(x, 2) = sqrt(x)
    if (!shouldReplaceLibcallWithIntrinsic(CI,
                                           /*AllowMinSizeF32=*/true,
                                           /*AllowF64=*/true))
      break;

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *Radicand
                      << ")\n");

    CallInst *SqrtCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, Radicand, CI);
    SqrtCall->takeName(CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *Accuracy =
        MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
    SqrtCall->setMetadata(LLVMContext::MD_fpmath, Accuracy);

    replaceCall(CI, SqrtCall);
    return true;
  }
  case 3: {
    // rootn(x, 3) = cbrt(x)
    FunctionCallee CbrtFn =
        getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo));
    if (!CbrtFn)
      break;

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *Radicand
                      << ")\n");
    Value *Cbrt = CreateCallEx(B,CbrtFn, Radicand, "__rootn2cbrt");
    replaceCall(FPOp, Cbrt);
    return true;
  }
  case -1: {
    // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *Radicand
                      << "\n");
    Constant *One = ConstantFP::get(Radicand->getType(), 1.0);
    Value *Recip = B.CreateFDiv(One, Radicand, "__rootn2div");
    replaceCall(FPOp, Recip);
    return true;
  }
  case -2: {
    // rootn(x, -2) = rsqrt(x)
    if (!shouldReplaceLibcallWithIntrinsic(CI,
                                           /*AllowMinSizeF32=*/true,
                                           /*AllowF64=*/true))
      break;

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *Accuracy =
        MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags Flags = FPOp->getFastMathFlags();
    Flags.setAllowContract(true);

    CallInst *SqrtCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, Radicand, CI);
    Instruction *Recip = cast<Instruction>(
        B.CreateFDiv(ConstantFP::get(Radicand->getType(), 1.0), SqrtCall));
    SqrtCall->setFastMathFlags(Flags);
    Recip->setFastMathFlags(Flags);
    Recip->setMetadata(LLVMContext::MD_fpmath, Accuracy);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *Radicand
                      << ")\n");
    replaceCall(CI, Recip);
    return true;
  }
  default:
    break;
  }

  return false;
}

// Emits the ordered comparison trunc(y) == y, i.e. "y is an integer".
static Value *emitIsInteger(IRBuilder<> &B, Value *Y) {
  Value *Truncated = B.CreateUnaryIntrinsic(Intrinsic::trunc, Y);
  Value *IsInteger = B.CreateFCmpOEQ(Truncated, Y);
  return IsInteger;
}

// is_even_integer(y) => is_integer(y * 0.5)
static Value *emitIsEvenInteger(IRBuilder<> &B, Value *Y) {
  // Halving an even integer yields another integer.
  Constant *Half = ConstantFP::get(Y->getType(), 0.5);
  Value *Halved = B.CreateFMul(Y, Half);
  return emitIsInteger(B, Halved);
}

// Emits is_integer(y) && !is_even_integer(y), i.e. "y is an odd integer".
static Value *emitIsOddInteger(IRBuilder<> &B, Value *Y) {
  // The integer test is emitted before the evenness test.
  Value *IsInteger = emitIsInteger(B, Y);
  Value *IsNotEven = B.CreateNot(emitIsEvenInteger(B, Y));
  return B.CreateAnd(IsInteger, IsNotEven);
}

// Emits the ordered comparison fabs(val) == +inf, true for either infinity.
static Value *emitIsInf(IRBuilder<> &B, Value *val) {
  Constant *PosInf = ConstantFP::getInfinity(val->getType());
  Value *Magnitude = B.CreateUnaryIntrinsic(Intrinsic::fabs, val);
  return B.CreateFCmpOEQ(Magnitude, PosInf);
}

// Emits exp2(y * log2(fabs(x))) using the generic fabs/log2/exp2 intrinsics.
static Value *emitFastExpYLnx(IRBuilder<> &B, Value *X, Value *Y) {
  Value *Magnitude = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
  Value *Log2Magnitude = B.CreateUnaryIntrinsic(Intrinsic::log2, Magnitude);
  Value *Scaled = B.CreateFMul(Y, Log2Magnitude);
  Value *Result = B.CreateUnaryIntrinsic(Intrinsic::exp2, Scaled);
  return Result;
}

/// Emit special case management epilog code for fast pow, powr, pown, and rootn
/// expansions.
///
/// \param B       Builder used to emit the fixup instructions.
/// \param X       First argument of the library call (possibly with some
///                values clamped by the caller).
/// \param Y       Second argument of the library call (possibly clamped). The
///                Pow and PowR cases compare it as a floating-point value; the
///                PowN and RootN cases apply integer operations to it.
/// \param ExpYLnX The result to use in normal circumstances.
/// \param Kind    Selects which function's special cases are emitted.
/// \returns The last value of the emitted select chain.
static Value *emitPowFixup(IRBuilder<> &B, Value *X, Value *Y, Value *ExpYLnX,
                           PowKind Kind) {
  // Floating-point constants shared by the cases below, all of X's type.
  Constant *Zero = ConstantFP::getZero(X->getType());
  Constant *One = ConstantFP::get(X->getType(), 1.0);
  Constant *QNaN = ConstantFP::getQNaN(X->getType());
  Constant *PInf = ConstantFP::getInfinity(X->getType());

  // Within each case, later selects take priority over earlier ones because
  // each one wraps the previous value of Ret.
  switch (Kind) {
  case PowKind::Pow: {
    // is_odd_integer(y)
    Value *IsOddY = emitIsOddInteger(B, Y);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(IsOddY, X, One);
    Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);

    // if (x < 0 && !is_integer(y)) ret = QNAN
    Value *IsIntY = emitIsInteger(B, Y);
    Value *condNegX = B.CreateFCmpOLT(X, Zero);
    Value *condNotIntY = B.CreateNot(IsIntY);
    Value *condNaN = B.CreateAnd(condNegX, condNotIntY);
    Ret = B.CreateSelect(condNaN, QNaN, Ret);

    // if (isinf(ay)) { ... }

    // FIXME: Missing backend optimization to save on materialization cost of
    // mixed sign constant infinities.
    Value *YIsInf = emitIsInf(B, Y);

    // y != fabs(y) (unordered compare). The result only reaches Ret through
    // the select on YIsInf below.
    Value *AY = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y);
    Value *YIsNegInf = B.CreateFCmpUNE(Y, AY);

    // For infinite y: |x| == 1 gives |x|, otherwise 0 or |y| depending on
    // (|x| < 1) ^ (y != |y|).
    Value *AX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
    Value *AxEqOne = B.CreateFCmpOEQ(AX, One);
    Value *AxLtOne = B.CreateFCmpOLT(AX, One);
    Value *XorCond = B.CreateXor(AxLtOne, YIsNegInf);
    Value *SelInf =
        B.CreateSelect(AxEqOne, AX, B.CreateSelect(XorCond, Zero, AY));
    Ret = B.CreateSelect(YIsInf, SelInf, Ret);

    // if (isinf(ax) || x == 0.0f) { ... }
    // The magnitude is 0 or +inf depending on (x == 0) ^ (y < 0); the sign
    // comes from x when y is an odd integer and is positive otherwise.
    Value *XIsInf = emitIsInf(B, X);
    Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
    Value *AxInfOrZero = B.CreateOr(XIsInf, XEqZero);
    Value *YLtZero = B.CreateFCmpOLT(Y, Zero);
    Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
    Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);
    Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
    Value *Copysign = B.CreateCopySign(SelVal, SelSign2);
    Ret = B.CreateSelect(AxInfOrZero, Copysign, Ret);

    // if (isunordered(x, y)) ret = QNAN
    Value *isUnordered = B.CreateFCmpUNO(X, Y);
    return B.CreateSelect(isUnordered, QNaN, Ret);
  }
  case PowKind::PowR: {
    // IZ is (+inf if y < 0 else 0); ZI is the opposite pairing.
    Value *YIsNeg = B.CreateFCmpOLT(Y, Zero);
    Value *IZ = B.CreateSelect(YIsNeg, PInf, Zero);
    Value *ZI = B.CreateSelect(YIsNeg, Zero, PInf);

    // if (x == 0) ret = (y == 0) ? QNAN : IZ
    Value *YEqZero = B.CreateFCmpOEQ(Y, Zero);
    Value *SelZeroCase = B.CreateSelect(YEqZero, QNaN, IZ);
    Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
    Value *Ret = B.CreateSelect(XEqZero, SelZeroCase, ExpYLnX);

    // if (x == +inf && y != 0) ret = ZI
    Value *XEqInf = B.CreateFCmpOEQ(X, PInf);
    Value *YNeZero = B.CreateFCmpUNE(Y, Zero);
    Value *CondInfCase = B.CreateAnd(XEqInf, YNeZero);
    Ret = B.CreateSelect(CondInfCase, ZI, Ret);

    // if (isinf(y) && x != 1) ret = (x < 1) ? IZ : ZI
    Value *IsInfY = emitIsInf(B, Y);
    Value *XNeOne = B.CreateFCmpUNE(X, One);
    Value *CondInfY = B.CreateAnd(IsInfY, XNeOne);
    Value *XLtOne = B.CreateFCmpOLT(X, One);
    Value *SelInfYCase = B.CreateSelect(XLtOne, IZ, ZI);
    Ret = B.CreateSelect(CondInfY, SelInfYCase, Ret);

    // if (isunordered(x, y)) ret = QNAN
    Value *IsUnordered = B.CreateFCmpUNO(X, Y);
    return B.CreateSelect(IsUnordered, QNaN, Ret);
  }
  case PowKind::PowN: {
    // Y is used as an integer in this case.
    Constant *ZeroI = ConstantInt::get(Y->getType(), 0);

    // is_odd_y = (ny & 1) != 0
    Value *OneI = ConstantInt::get(Y->getType(), 1);
    Value *YAnd1 = B.CreateAnd(Y, OneI);
    Value *IsOddY = B.CreateICmpNE(YAnd1, ZeroI);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(IsOddY, X, One);
    Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);

    // if (isinf(x) || x == 0.0f)
    Value *FabsX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
    Value *XIsInf = B.CreateFCmpOEQ(FabsX, PInf);
    Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
    Value *InfOrZero = B.CreateOr(XIsInf, XEqZero);

    // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
    Value *YLtZero = B.CreateICmpSLT(Y, ZeroI);
    Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
    Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);

    // copysign(selVal, is_odd_y ? x : 0.0f)
    Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
    Value *Copysign = B.CreateCopySign(SelVal, SelSign2);

    return B.CreateSelect(InfOrZero, Copysign, Ret);
  }
  case PowKind::RootN: {
    // Y is used as an integer in this case. The first part mirrors the PowN
    // case; the QNaN handling at the end is specific to rootn.
    Constant *ZeroI = ConstantInt::get(Y->getType(), 0);

    // is_odd_y = (ny & 1) != 0
    Value *YAnd1 = B.CreateAnd(Y, ConstantInt::get(Y->getType(), 1));
    Value *IsOddY = B.CreateICmpNE(YAnd1, ZeroI);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(IsOddY, X, One);
    Value *Ret = B.CreateCopySign(ExpYLnX, SelSign);

    // if (isinf(x) || x == 0.0f)
    Value *FabsX = B.CreateUnaryIntrinsic(Intrinsic::fabs, X);
    Value *IsInfX = B.CreateFCmpOEQ(FabsX, PInf);
    Value *XEqZero = B.CreateFCmpOEQ(X, Zero);
    Value *CondInfOrZero = B.CreateOr(IsInfX, XEqZero);

    // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
    Value *YLtZero = B.CreateICmpSLT(Y, ZeroI);
    Value *XorZeroInf = B.CreateXor(XEqZero, YLtZero);
    Value *SelVal = B.CreateSelect(XorZeroInf, Zero, PInf);

    // copysign(selVal, is_odd_y ? x : 0.0f)
    Value *SelSign2 = B.CreateSelect(IsOddY, X, Zero);
    Value *Copysign = B.CreateCopySign(SelVal, SelSign2);

    Ret = B.CreateSelect(CondInfOrZero, Copysign, Ret);

    // if ((x < 0.0f && !is_odd_y) || ny == 0) ret = QNAN
    Value *XIsNeg = B.CreateFCmpOLT(X, Zero);
    Value *NotOddY = B.CreateNot(IsOddY);
    Value *CondNegAndNotOdd = B.CreateAnd(XIsNeg, NotOddY);
    Value *YEqZero = B.CreateICmpEQ(Y, ZeroI);
    Value *CondBad = B.CreateOr(CondNegAndNotOdd, YEqZero);
    return B.CreateSelect(CondBad, QNaN, Ret);
  }
  }

  llvm_unreachable("covered switch");
}

// TODO: Move the fold_pow folding to sqrt/fdiv here
/// Replace a float pow-family call with an inline exp2(y * log2(|x|)) sequence
/// followed by the special-case fixups for \p Kind.
///
/// \returns false (leaving \p FPOp untouched) when the scalar type is not
/// float, true once the call has been replaced.
bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
                                   PowKind Kind) {
  // There's currently no reason to do this for half. The correct path is
  // promote to float and use the fast float expansion.
  //
  // TODO: We could move this expansion to lowering to get half pow to work.
  if (!FPOp->getType()->getScalarType()->isFloatTy())
    return false;

  // TODO: Verify optimization for double and bfloat.
  Value *X = FPOp->getOperand(0);
  Value *Y = FPOp->getOperand(1);
  Type *XTy = X->getType();

  // Common tail of every case: emit the fast approximation using the given
  // floating-point exponent, patch up the special cases using the (possibly
  // adjusted) X and Y, and replace the call.
  auto FinishExpansion = [&](Value *FPExponent) {
    Value *Approx = emitFastExpYLnx(B, X, FPExponent);
    Value *Patched = emitPowFixup(B, X, Y, Approx, Kind);
    replaceCall(FPOp, Patched);
    return true;
  };

  switch (Kind) {
  case PowKind::Pow: {
    Constant *One = ConstantFP::get(XTy, 1.0);

    // if (x == 1.0f) y = 1.0f;
    Value *XIsOne = B.CreateFCmpOEQ(X, One);
    Y = B.CreateSelect(XIsOne, One, Y);

    // if (y == 0.0f) x = 1.0f;
    Value *YIsZero = B.CreateFCmpOEQ(Y, ConstantFP::getZero(XTy));
    X = B.CreateSelect(YIsZero, One, X);

    return FinishExpansion(Y);
  }
  case PowKind::PowR: {
    // x = (x < 0.0f ? QNAN : x)
    Value *XIsNeg = B.CreateFCmpOLT(X, ConstantFP::getZero(XTy));
    X = B.CreateSelect(XIsNeg, ConstantFP::getQNaN(XTy), X);

    return FinishExpansion(Y);
  }
  case PowKind::PowN: {
    // ny == 0
    Value *YIsZero = B.CreateICmpEQ(Y, ConstantInt::get(Y->getType(), 0));

    // x = (ny == 0 ? 1.0f : x)
    X = B.CreateSelect(YIsZero, ConstantFP::get(XTy, 1.0), X);

    Value *YAsFP = B.CreateSIToFP(Y, XTy);
    return FinishExpansion(YAsFP);
  }
  case PowKind::RootN: {
    // The exponent is 1 / (float)ny.
    Value *YAsFP = B.CreateSIToFP(Y, XTy);
    Value *RecipY = B.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, YAsFP);
    return FinishExpansion(RecipY);
  }
  }
  llvm_unreachable("Unhandled PowKind enum");
}

/// Try to turn a pow call into something cheaper, in this order:
///   1. powr, when x cannot be ordered-less-than zero;
///   2. pown, when y is known to be integral;
///   3. the generic fold_pow simplifications;
///   4. with the afn flag only: the fast pow library variant (f32 EI_POW
///      calls), or the inline fast expansion.
///
/// \returns true if the call was changed.
bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
                                    const FuncInfo &FInfo) {
  CallInst *Call = cast<CallInst>(FPOp);
  Module *M = Call->getModule();
  FastMathFlags FMF = FPOp->getFastMathFlags();

  const bool HasApproxFunc = FMF.approxFunc();
  // A fast variant is requested for afn calls and for calls that already
  // target the fast pow.
  const bool WantFastVariant =
      HasApproxFunc || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST;

  FuncInfo PowrInfo;
  FunctionCallee PowrFunc = getFloatFastVariant(
      M, FInfo, PowrInfo, AMDGPULibFunc::EI_POWR,
      WantFastVariant ? AMDGPULibFunc::EI_POWR_FAST : AMDGPULibFunc::EI_NONE);

  // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.

  // pow(x, y) -> powr(x, y) for x >= -0.0
  // TODO: Account for flags on current call
  if (PowrFunc && cannotBeOrderedLessThanZero(FPOp->getOperand(0),
                                              SQ.getWithInstruction(Call))) {
    Call->setCalledFunction(PowrFunc);
    // The callee was already changed, so report success even when no further
    // folding applies.
    (void)fold_pow(FPOp, B, PowrInfo);
    return true;
  }

  // pow(x, y) -> pown(x, y) for known integral y
  if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(Call), FMF)) {
    FunctionType *PownType = getPownType(Call->getFunctionType());

    FuncInfo PownInfo;
    FunctionCallee PownFunc = getFloatFastVariant(
        M, FInfo, PownInfo, AMDGPULibFunc::EI_POWN,
        WantFastVariant ? AMDGPULibFunc::EI_POWN_FAST
                        : AMDGPULibFunc::EI_NONE);

    if (PownFunc) {
      // TODO: If the incoming integral value is an sitofp/uitofp, it won't
      // fold out without a known range. We can probably take the source
      // value directly.
      Value *IntExponent =
          B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
      // Have to drop any nofpclass attributes on the original call site.
      Call->removeParamAttrs(
          1, AttributeFuncs::typeIncompatible(IntExponent->getType(),
                                              Call->getParamAttributes(1)));
      Call->setCalledFunction(PownFunc);
      Call->setArgOperand(1, IntExponent);
      // As above: the call was rewritten regardless of what fold_pow does.
      (void)fold_pow(FPOp, B, PownInfo);
      return true;
    }
  }

  if (fold_pow(FPOp, B, FInfo))
    return true;

  // Everything below requires the afn flag.
  if (!HasApproxFunc)
    return false;

  const bool IsF32Pow = FInfo.getId() == AMDGPULibFunc::EI_POW &&
                        getArgType(FInfo) == AMDGPULibFunc::F32;
  if (IsF32Pow) {
    AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
    if (FunctionCallee PowFastFunc = getFunction(M, PowFastInfo)) {
      Call->setCalledFunction(PowFastFunc);
      (void)fold_pow(FPOp, B, PowFastInfo);
      return true;
    }
  }

  return expandFastPow(FPOp, B, PowKind::Pow);
}

// Get a scalar native builtin single argument FP function. Returns a null
// callee for f64 functions and for functions without a native variant.
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  const bool IsF64 = getArgType(FInfo) == AMDGPULibFunc::F64;
  if (IsF64)
    return nullptr;
  if (!HasNative(FInfo.getId()))
    return nullptr;

  // Same function description, but carrying the native prefix.
  FuncInfo NativeInfo(FInfo);
  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, NativeInfo);
}

// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;
  return true;
}

/// Retarget \p CI to the intrinsic \p IntrID, overloaded on the call's result
/// type. For two-argument calls that mix one vector and one scalar operand,
/// the scalar operand is first replaced by a splat of matching element count.
void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
    auto *LHSVecTy = dyn_cast<VectorType>(CI->getArgOperand(0)->getType());
    auto *RHSVecTy = dyn_cast<VectorType>(CI->getArgOperand(1)->getType());

    // Replace the scalar operand at ScalarIdx by a splat as wide as VecTy.
    auto Broadcast = [&](unsigned ScalarIdx, VectorType *VecTy) {
      Value *Splat = B.CreateVectorSplat(VecTy->getElementCount(),
                                         CI->getArgOperand(ScalarIdx));
      CI->setArgOperand(ScalarIdx, Splat);
    };

    if (LHSVecTy && !RHSVecTy)
      Broadcast(1, LHSVecTy);
    else if (RHSVecTy && !LHSVecTy)
      Broadcast(0, RHSVecTy);
  }

  Function *IntrinsicDecl = Intrinsic::getOrInsertDeclaration(
      CI->getModule(), IntrID, {CI->getType()});
  CI->setCalledFunction(IntrinsicDecl);
}

/// Replace \p CI with the intrinsic \p IntrID if
/// shouldReplaceLibcallWithIntrinsic() permits it for the given options.
///
/// \returns true if the call was retargeted.
bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  const bool CanReplace = shouldReplaceLibcallWithIntrinsic(
      CI, AllowMinSizeF32, AllowF64, AllowStrictFP);
  if (CanReplace)
    replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return CanReplace;
}

/// Emit a call to \p Fsincos on \p Arg together with the stack slot that
/// receives the cosine.
///
/// The alloca is placed past the allocas of the current function. The call
/// and the load go right after \p Arg when it is an instruction, and after
/// the alloca otherwise. \p FMF is not used by this function.
///
/// \returns {sin value, cos value, sincos call}; the sin value is the call
/// itself and the cos value is a load from the stack slot.
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc SavedLoc = B.getCurrentDebugLocation();
  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  B.SetInsertPointPastAllocas(B.GetInsertBlock()->getParent());
  AllocaInst *CosSlot = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  // If the argument is an instruction, it must dominate all uses so put our
  // sincos call there. Otherwise, right after the allocas works well enough
  // if it's an argument or constant.
  if (auto *ArgInst = dyn_cast<Instruction>(Arg)) {
    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(SavedLoc);
  }

  // The allocaInst allocates the memory in private address space. This need
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Value *CosPtr = B.CreateAddrSpaceCast(CosSlot, CosPtrTy);

  CallInst *SinCosCall = CreateCallEx2(B, Fsincos, Arg, CosPtr);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?

  LoadInst *CosValue = B.CreateLoad(Arg->getType(), CosSlot);
  return {SinCosCall, CosValue, SinCosCall};
}

// fold sin, cos -> sincos.
//
// Starting from one sin or cos call, look for calls to the partner function
// (and to sincos) on the same argument within the same function. When both a
// sin and a cos call exist, a single sincos call is emitted and the uses of
// all matching calls are redirected to its results.
//
// Returns true if the original call was erased. The other matched calls have
// their uses replaced but are left in place.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  // Only unprefixed f32/f64 sin/cos calls are merged.
  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);

  // TODO: Constant fold the call
  if (isa<ConstantData>(CArgVal))
    return false;

  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  // Calls on CArgVal, bucketed by which function they invoke.
  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  // The current call supplies one of the two names; the partner's name is
  // obtained by mangling.
  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  // Classify every user of the argument. Users that are not calls, live in a
  // different function, are nobuiltin, or have no directly called function
  // are skipped.
  for (User* U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    // Every matched call contributes its debug location, narrows the
    // fast-math flags, and is merged into the fpmath metadata.
    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  // Nothing to merge unless both a sin and a cos call were found.
  if (SinCalls.empty() || CosCalls.empty())
    return false;

  // The new call is built with the combined flags, metadata and location.
  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  // Redirect all uses of the given calls to Res without erasing the calls.
  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}

/// Evaluate one scalar library math function on the host so that a call with
/// constant operands can be folded.
///
/// \param FInfo  identifies the library function (FInfo.getId()) and its
///               argument type (via getArgType(FInfo)).
/// \param Res0   receives the result, built from a host double.
/// \param Res1   only written for EI_SINCOS, where it receives the cosine.
/// \param copr0  first operand, or null. Used only if it is a ConstantFP;
///               otherwise the function is evaluated as if it were 0.0.
/// \param copr1  second operand, or null. Read as a ConstantFP for pow/powr
///               and as a ConstantInt for pown/rootn.
/// \returns true if the function was evaluated, false if it is not handled
///          here or a required integer operand is missing.
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
                                            APFloat &Res0, APFloat &Res1,
                                            Constant *copr0, Constant *copr1) {
  // opr0/opr1 hold the operands widened to double when they are ConstantFP
  // and stay 0.0 otherwise. A function whose operand is not floating point
  // (e.g. the integer exponent of pown/rootn) has to decode that operand
  // itself from copr0/copr1.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
             ? fpopr0->getValueAPF().convertToDouble()
             : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
             ? fpopr1->getValueAPF().convertToDouble()
             : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default:
    return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = APFloat{std::acos(opr0)};
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // Use the library routine: log(x + sqrt(x*x - 1)) overflows in x*x for
    // large x.
    Res0 = APFloat{std::acosh(opr0)};
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = APFloat{std::acos(opr0) / MATH_PI};
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = APFloat{std::asin(opr0)};
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // Use the library routine: log(x + sqrt(x*x + 1)) cancels to log(0) for
    // large negative x.
    Res0 = APFloat{std::asinh(opr0)};
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = APFloat{std::asin(opr0) / MATH_PI};
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = APFloat{std::atan(opr0)};
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(1+x) - log(1-x))/2 for |x| < 1. The previous
    // (log(x+1) - log(x-1))/2 form took the log of a negative number over
    // that whole domain and produced NaN.
    Res0 = APFloat{std::atanh(opr0)};
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = APFloat{std::atan(opr0) / MATH_PI};
    return true;

  case AMDGPULibFunc::EI_CBRT:
    // std::cbrt handles negative inputs and keeps the sign of zero.
    Res0 = APFloat{std::cbrt(opr0)};
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = APFloat{std::cos(opr0)};
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = APFloat{std::cosh(opr0)};
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = APFloat{std::cos(MATH_PI * opr0)};
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = APFloat{std::exp(opr0)};
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = APFloat{std::exp2(opr0)};
    return true;

  case AMDGPULibFunc::EI_EXP10:
    // There is no standard exp10, so go through pow.
    Res0 = APFloat{std::pow(10.0, opr0)};
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = APFloat{std::log(opr0)};
    return true;

  case AMDGPULibFunc::EI_LOG2:
    // log(x)/log(2) is not exact even for exact powers of two.
    Res0 = APFloat{std::log2(opr0)};
    return true;

  case AMDGPULibFunc::EI_LOG10:
    // log(x)/log(10) is not exact even for exact powers of ten.
    Res0 = APFloat{std::log10(opr0)};
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = APFloat{1.0 / std::sqrt(opr0)};
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = APFloat{std::sin(opr0)};
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = APFloat{std::sinh(opr0)};
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = APFloat{std::sin(MATH_PI * opr0)};
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = APFloat{std::tan(opr0)};
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = APFloat{std::tanh(opr0)};
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = APFloat{std::tan(MATH_PI * opr0)};
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = APFloat{std::pow(opr0, opr1)};
    return true;

  case AMDGPULibFunc::EI_POWN: {
    // The exponent is an integer constant, not a ConstantFP.
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = APFloat{std::pow(opr0, val)};
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    // The root degree is an integer constant, not a ConstantFP.
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      const int64_t degree = iopr1->getSExtValue();
      const double invDegree = 1.0 / (double)degree;
      if (degree % 2 != 0) {
        // An odd root of a negative value is real and negative, but
        // pow(negative, non-integer) is NaN. Take the root of the magnitude
        // and reapply the operand's sign (this also keeps the sign of zero).
        Res0 = APFloat{
            std::copysign(std::pow(std::fabs(opr0), invDegree), opr0)};
      } else {
        Res0 = APFloat{std::pow(opr0, invDegree)};
      }
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = APFloat{std::sin(opr0)};
    Res1 = APFloat{std::cos(opr0)};
    return true;
  }

  return false;
}

/// Try to fold the library call \p aCI, described by \p FInfo, into constants
/// computed on the host.
///
/// The first two call operands must be constants, except that the second
/// operand of sincos (the pointer the cosine is stored through) may be
/// anything. Each lane is evaluated with evaluateScalarMathFunc. On success
/// the cosine of a sincos is stored through operand 1, the primary result is
/// handed to replaceCall, and true is returned. Returns false, without
/// changing the IR, if the call cannot be evaluated.
bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  const bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    copr0 = dyn_cast<Constant>(aCI->getArgOperand(0));
    if (!copr0)
      return false;
  }

  if (numArgs > 1) {
    copr1 = dyn_cast<Constant>(aCI->getArgOperand(1));
    // Only sincos tolerates a non-constant second operand: it is the output
    // pointer, not a value that takes part in the evaluation.
    if (!copr1 && !hasTwoResults)
      return false;
  }

  // At this point operand 0 is a constant, and operand 1 is a constant unless
  // this is sincos. A third operand, if present, is not inspected.

  Type *RetTy = aCI->getType();

  // max vector size is 16, and sincos will generate two results.
  SmallVector<APFloat, 16> Val0, Val1;
  int FuncVecSize = getVecSize(FInfo);
  if (FuncVecSize == 1) {
    // The scalar result is materialized as a ConstantFP below, which needs a
    // floating-point (or FP vector) result type.
    if (!RetTy->isFPOrFPVectorTy())
      return false;

    if (!evaluateScalarMathFunc(FInfo, Val0.emplace_back(0.0),
                                Val1.emplace_back(0.0), copr0, copr1)) {
      return false;
    }
  } else {
    // Extract lane `Lane` of a vector constant into `Elt`. Non-vector operands
    // (absent, or the sincos pointer) yield a null element and succeed. A
    // vector constant whose lane cannot be extracted (e.g. a constant
    // expression) fails, so it is not silently evaluated as zero.
    auto extractLane = [](Constant *Opr, int Lane, Constant *&Elt) -> bool {
      Elt = nullptr;
      if (!Opr || !Opr->getType()->isVectorTy())
        return true;
      // getAggregateElement also handles ConstantVector, zeroinitializer and
      // undef/poison, not just ConstantDataVector.
      Elt = Opr->getAggregateElement(static_cast<unsigned>(Lane));
      return Elt != nullptr;
    };

    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = nullptr;
      Constant *celt1 = nullptr;
      if (!extractLane(copr0, i, celt0) || !extractLane(copr1, i, celt1))
        return false;

      if (!evaluateScalarMathFunc(FInfo, Val0.emplace_back(0.0),
                                  Val1.emplace_back(0.0), celt0, celt1)) {
        return false;
      }
    }
  }

  Constant *nval0 = nullptr;
  Constant *nval1 = nullptr;
  if (FuncVecSize == 1) {
    // evaluateScalarMathFunc produces values built from host doubles. Round
    // them to the result type's format first: ConstantFP::get(Type *, const
    // APFloat &) expects the value's semantics to match the type.
    auto toResultConstant = [RetTy](APFloat V) -> Constant * {
      bool LosesInfo;
      V.convert(RetTy->getScalarType()->getFltSemantics(),
                APFloat::rmNearestTiesToEven, &LosesInfo);
      return ConstantFP::get(RetTy, V);
    };

    nval0 = toResultConstant(Val0[0]);
    if (hasTwoResults)
      nval1 = toResultConstant(Val1[0]);
  } else {
    nval0 = getConstantFloatVector(Val0, RetTy);
    if (hasTwoResults)
      nval1 = getConstantFloatVector(Val1, RetTy);
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    // The cosine goes out through the pointer operand, just before the call.
    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
  }

  replaceCall(aCI, nval0);
  return true;
}

/// Run the library-call simplifier over \p F: every call instruction in the
/// function is offered to AMDGPULibCalls::fold. Reports that no analyses are
/// preserved if any fold succeeded, and that all are preserved otherwise.
PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  bool Changed = false;
  for (auto &Block : F) {
    BasicBlock::iterator Cur = Block.begin();
    BasicBlock::iterator End = Block.end();
    while (Cur != End) {
      // Step the iterator past the instruction before folding, so the
      // iterator does not point at the call while it is being rewritten.
      CallInst *Call = dyn_cast<CallInst>(Cur);
      ++Cur;

      // Ignore non-calls.
      if (!Call)
        continue;

      if (Simplifier.fold(Call))
        Changed = true;
    }
  }

  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

/// Offer every call instruction in \p F to AMDGPULibCalls::useNative. Does
/// nothing when the -amdgpu-use-native list is empty. Reports that no analyses
/// are preserved if any call was changed, and that all are preserved
/// otherwise.
PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  // Nothing was requested on the command line.
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  bool Changed = false;
  for (auto &Block : F) {
    BasicBlock::iterator Cur = Block.begin();
    BasicBlock::iterator End = Block.end();
    while (Cur != End) {
      // Step the iterator past the instruction before handing the call to
      // useNative, so the iterator does not point at the call being rewritten.
      CallInst *Call = dyn_cast<CallInst>(Cur);
      ++Cur;

      // Ignore non-calls.
      if (!Call)
        continue;

      if (Simplifier.useNative(Call))
        Changed = true;
    }
  }

  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}