Home | History | Annotate | Download | only in AMDGPU
      1 //===- AMDGPULibCalls.cpp -------------------------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// This file does AMD library function optimizations.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #define DEBUG_TYPE "amdgpu-simplifylib"
     16 
     17 #include "AMDGPU.h"
     18 #include "AMDGPULibFunc.h"
     19 #include "llvm/Analysis/AliasAnalysis.h"
     20 #include "llvm/Analysis/Loads.h"
     21 #include "llvm/ADT/StringSet.h"
     22 #include "llvm/ADT/StringRef.h"
     23 #include "llvm/IR/Constants.h"
     24 #include "llvm/IR/DerivedTypes.h"
     25 #include "llvm/IR/Instructions.h"
     26 #include "llvm/IR/IRBuilder.h"
     27 #include "llvm/IR/Function.h"
     28 #include "llvm/IR/LLVMContext.h"
     29 #include "llvm/IR/Module.h"
     30 #include "llvm/IR/ValueSymbolTable.h"
     31 #include "llvm/Support/Debug.h"
     32 #include "llvm/Support/raw_ostream.h"
     33 #include "llvm/Target/TargetOptions.h"
     34 #include <vector>
     35 #include <cmath>
     36 
     37 using namespace llvm;
     38 
// Hidden option "amdgpu-prelink" (off by default). When set, getFunction()
// uses AMDGPULibFunc::getOrInsertFunction() instead of only looking up
// functions that already exist in the module.
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

// Hidden option "amdgpu-use-native": comma separated names of the functions
// whose calls should be redirected to their native versions. The value is
// optional; initNativeFuncs() treats "all" or a single empty value as a
// request to replace every function that has a native version.
static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);
     48 
// Mathematical constants used by the folding tables and rewrites in this file.
// Value of pi
#define MATH_PI     3.14159265358979323846264338327950288419716939937511
// Value of e
#define MATH_E      2.71828182845904523536028747135266249775724709369996
// Value of sqrt(2)
#define MATH_SQRT2  1.41421356237309504880168872420969807856967187537695

// Value of log2(e)
#define MATH_LOG2E     1.4426950408889634073599246810018921374266459541529859
// Value of log10(e)
#define MATH_LOG10E    0.4342944819032518276511289189166050822943970058036665
// Value of log2(10)
#define MATH_LOG2_10   3.3219280948873623478703194294893901758648313930245806
// Value of 1 / log2(10)
#define MATH_RLOG2_10  0.3010299956639811952137388947244930267681898814621085
// Value of 1 / M_LOG2E_F = 1 / log2(e)
#define MATH_RLOG2_E   0.6931471805599453094172321214581765680755001343602552
     61 
     62 namespace llvm {
     63 
     64 class AMDGPULibCalls {
     65 private:
     66 
     67   typedef llvm::AMDGPULibFunc FuncInfo;
     68 
     69   // -fuse-native.
     70   bool AllNative = false;
     71 
     72   bool useNativeFunc(const StringRef F) const;
     73 
     74   // Return a pointer (pointer expr) to the function if function defintion with
     75   // "FuncName" exists. It may create a new function prototype in pre-link mode.
     76   Constant *getFunction(Module *M, const FuncInfo& fInfo);
     77 
     78   // Replace a normal function with its native version.
     79   bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
     80 
     81   bool parseFunctionName(const StringRef& FMangledName,
     82                          FuncInfo *FInfo=nullptr /*out*/);
     83 
     84   bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
     85 
     86   /* Specialized optimizations */
     87 
     88   // recip (half or native)
     89   bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
     90 
     91   // divide (half or native)
     92   bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
     93 
     94   // pow/powr/pown
     95   bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
     96 
     97   // rootn
     98   bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
     99 
    100   // fma/mad
    101   bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    102 
    103   // -fuse-native for sincos
    104   bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
    105 
    106   // evaluate calls if calls' arguments are constants.
    107   bool evaluateScalarMathFunc(FuncInfo &FInfo, double& Res0,
    108     double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
    109   bool evaluateCall(CallInst *aCI, FuncInfo &FInfo);
    110 
    111   // exp
    112   bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    113 
    114   // exp2
    115   bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    116 
    117   // exp10
    118   bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    119 
    120   // log
    121   bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    122 
    123   // log2
    124   bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    125 
    126   // log10
    127   bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    128 
    129   // sqrt
    130   bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
    131 
    132   // sin/cos
    133   bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
    134 
    135   // __read_pipe/__write_pipe
    136   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
    137 
    138   // Get insertion point at entry.
    139   BasicBlock::iterator getEntryIns(CallInst * UI);
    140   // Insert an Alloc instruction.
    141   AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
    142   // Get a scalar native builtin signle argument FP function
    143   Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
    144 
    145 protected:
    146   CallInst *CI;
    147 
    148   bool isUnsafeMath(const CallInst *CI) const;
    149 
    150   void replaceCall(Value *With) {
    151     CI->replaceAllUsesWith(With);
    152     CI->eraseFromParent();
    153   }
    154 
    155 public:
    156   bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
    157 
    158   void initNativeFuncs();
    159 
    160   // Replace a normal math function call with that native version
    161   bool useNative(CallInst *CI);
    162 };
    163 
    164 } // end llvm namespace
    165 
    166 namespace {
    167 
    168   class AMDGPUSimplifyLibCalls : public FunctionPass {
    169 
    170   AMDGPULibCalls Simplifier;
    171 
    172   const TargetOptions Options;
    173 
    174   public:
    175     static char ID; // Pass identification
    176 
    177     AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
    178       : FunctionPass(ID), Options(Opt) {
    179       initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
    180     }
    181 
    182     void getAnalysisUsage(AnalysisUsage &AU) const override {
    183       AU.addRequired<AAResultsWrapperPass>();
    184     }
    185 
    186     bool runOnFunction(Function &M) override;
    187   };
    188 
    189   class AMDGPUUseNativeCalls : public FunctionPass {
    190 
    191   AMDGPULibCalls Simplifier;
    192 
    193   public:
    194     static char ID; // Pass identification
    195 
    196     AMDGPUUseNativeCalls() : FunctionPass(ID) {
    197       initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry());
    198       Simplifier.initNativeFuncs();
    199     }
    200 
    201     bool runOnFunction(Function &F) override;
    202   };
    203 
    204 } // end anonymous namespace.
    205 
// Storage for the static pass identifiers declared in the pass classes.
char AMDGPUSimplifyLibCalls::ID = 0;
char AMDGPUUseNativeCalls::ID = 0;

// Register the simplification pass under the name "amdgpu-simplifylib" and
// record its dependency on AAResultsWrapperPass.
INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
                      "Simplify well-known AMD library calls", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
                    "Simplify well-known AMD library calls", false, false)

// Register the native replacement pass under the name "amdgpu-usenative".
INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
                "Replace builtin math calls with that native versions.",
                false, false)
    218 
    219 template <typename IRB>
    220 static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg,
    221                               const Twine &Name = "") {
    222   CallInst *R = B.CreateCall(Callee, Arg, Name);
    223   if (Function* F = dyn_cast<Function>(Callee))
    224     R->setCallingConv(F->getCallingConv());
    225   return R;
    226 }
    227 
    228 template <typename IRB>
    229 static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
    230                                const Twine &Name = "") {
    231   CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
    232   if (Function* F = dyn_cast<Function>(Callee))
    233     R->setCallingConv(F->getCallingConv());
    234   return R;
    235 }
    236 
    237 //  Data structures for table-driven optimizations.
    238 //  FuncTbl works for both f32 and f64 functions with 1 input argument
    239 
// One row of a folding table: calling the function with the constant `input`
// yields `result`. Note the member order: result first, then input.
struct TableEntry {
  double   result;
  double   input;
};
    244 
/* a list of {result, input} */
// Each table below holds the known results of one function for a few
// constant inputs; getOptTable() selects the table for a function id and
// TDOFold() performs the lookup.
static const TableEntry tbl_acos[] = {
  {MATH_PI/2.0, 0.0},
  {MATH_PI/2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI/2.0, 1.0},
  {-MATH_PI/2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI/4.0, 1.0},
  {-MATH_PI/4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {1.0/MATH_SQRT2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
// NOTE(review): unlike most tables here, sqrt has no entry for -0.0, so a
// -0.0 argument is simply not folded by the table lookup.
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};
    389 
    390 static bool HasNative(AMDGPULibFunc::EFuncId id) {
    391   switch(id) {
    392   case AMDGPULibFunc::EI_DIVIDE:
    393   case AMDGPULibFunc::EI_COS:
    394   case AMDGPULibFunc::EI_EXP:
    395   case AMDGPULibFunc::EI_EXP2:
    396   case AMDGPULibFunc::EI_EXP10:
    397   case AMDGPULibFunc::EI_LOG:
    398   case AMDGPULibFunc::EI_LOG2:
    399   case AMDGPULibFunc::EI_LOG10:
    400   case AMDGPULibFunc::EI_POWR:
    401   case AMDGPULibFunc::EI_RECIP:
    402   case AMDGPULibFunc::EI_RSQRT:
    403   case AMDGPULibFunc::EI_SIN:
    404   case AMDGPULibFunc::EI_SINCOS:
    405   case AMDGPULibFunc::EI_SQRT:
    406   case AMDGPULibFunc::EI_TAN:
    407     return true;
    408   default:;
    409   }
    410   return false;
    411 }
    412 
    413 struct TableRef {
    414   size_t size;
    415   const TableEntry *table; // variable size: from 0 to (size - 1)
    416 
    417   TableRef() : size(0), table(nullptr) {}
    418 
    419   template <size_t N>
    420   TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {}
    421 };
    422 
    423 static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
    424   switch(id) {
    425   case AMDGPULibFunc::EI_ACOS:    return TableRef(tbl_acos);
    426   case AMDGPULibFunc::EI_ACOSH:   return TableRef(tbl_acosh);
    427   case AMDGPULibFunc::EI_ACOSPI:  return TableRef(tbl_acospi);
    428   case AMDGPULibFunc::EI_ASIN:    return TableRef(tbl_asin);
    429   case AMDGPULibFunc::EI_ASINH:   return TableRef(tbl_asinh);
    430   case AMDGPULibFunc::EI_ASINPI:  return TableRef(tbl_asinpi);
    431   case AMDGPULibFunc::EI_ATAN:    return TableRef(tbl_atan);
    432   case AMDGPULibFunc::EI_ATANH:   return TableRef(tbl_atanh);
    433   case AMDGPULibFunc::EI_ATANPI:  return TableRef(tbl_atanpi);
    434   case AMDGPULibFunc::EI_CBRT:    return TableRef(tbl_cbrt);
    435   case AMDGPULibFunc::EI_NCOS:
    436   case AMDGPULibFunc::EI_COS:     return TableRef(tbl_cos);
    437   case AMDGPULibFunc::EI_COSH:    return TableRef(tbl_cosh);
    438   case AMDGPULibFunc::EI_COSPI:   return TableRef(tbl_cospi);
    439   case AMDGPULibFunc::EI_ERFC:    return TableRef(tbl_erfc);
    440   case AMDGPULibFunc::EI_ERF:     return TableRef(tbl_erf);
    441   case AMDGPULibFunc::EI_EXP:     return TableRef(tbl_exp);
    442   case AMDGPULibFunc::EI_NEXP2:
    443   case AMDGPULibFunc::EI_EXP2:    return TableRef(tbl_exp2);
    444   case AMDGPULibFunc::EI_EXP10:   return TableRef(tbl_exp10);
    445   case AMDGPULibFunc::EI_EXPM1:   return TableRef(tbl_expm1);
    446   case AMDGPULibFunc::EI_LOG:     return TableRef(tbl_log);
    447   case AMDGPULibFunc::EI_NLOG2:
    448   case AMDGPULibFunc::EI_LOG2:    return TableRef(tbl_log2);
    449   case AMDGPULibFunc::EI_LOG10:   return TableRef(tbl_log10);
    450   case AMDGPULibFunc::EI_NRSQRT:
    451   case AMDGPULibFunc::EI_RSQRT:   return TableRef(tbl_rsqrt);
    452   case AMDGPULibFunc::EI_NSIN:
    453   case AMDGPULibFunc::EI_SIN:     return TableRef(tbl_sin);
    454   case AMDGPULibFunc::EI_SINH:    return TableRef(tbl_sinh);
    455   case AMDGPULibFunc::EI_SINPI:   return TableRef(tbl_sinpi);
    456   case AMDGPULibFunc::EI_NSQRT:
    457   case AMDGPULibFunc::EI_SQRT:    return TableRef(tbl_sqrt);
    458   case AMDGPULibFunc::EI_TAN:     return TableRef(tbl_tan);
    459   case AMDGPULibFunc::EI_TANH:    return TableRef(tbl_tanh);
    460   case AMDGPULibFunc::EI_TANPI:   return TableRef(tbl_tanpi);
    461   case AMDGPULibFunc::EI_TGAMMA:  return TableRef(tbl_tgamma);
    462   default:;
    463   }
    464   return TableRef();
    465 }
    466 
    467 static inline int getVecSize(const AMDGPULibFunc& FInfo) {
    468   return FInfo.getLeads()[0].VectorSize;
    469 }
    470 
    471 static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
    472   return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
    473 }
    474 
    475 Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
    476   // If we are doing PreLinkOpt, the function is external. So it is safe to
    477   // use getOrInsertFunction() at this stage.
    478 
    479   return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
    480                        : AMDGPULibFunc::getFunction(M, fInfo);
    481 }
    482 
    483 bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
    484                                     FuncInfo *FInfo) {
    485   return AMDGPULibFunc::parse(FMangledName, *FInfo);
    486 }
    487 
    488 bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
    489   if (auto Op = dyn_cast<FPMathOperator>(CI))
    490     if (Op->isFast())
    491       return true;
    492   const Function *F = CI->getParent()->getParent();
    493   Attribute Attr = F->getFnAttribute("unsafe-fp-math");
    494   return Attr.getValueAsString() == "true";
    495 }
    496 
    497 bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
    498   return AllNative ||
    499          std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end();
    500 }
    501 
    502 void AMDGPULibCalls::initNativeFuncs() {
    503   AllNative = useNativeFunc("all") ||
    504               (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
    505                UseNative.begin()->empty());
    506 }
    507 
    508 bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
    509   bool native_sin = useNativeFunc("sin");
    510   bool native_cos = useNativeFunc("cos");
    511 
    512   if (native_sin && native_cos) {
    513     Module *M = aCI->getModule();
    514     Value *opr0 = aCI->getArgOperand(0);
    515 
    516     AMDGPULibFunc nf;
    517     nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    518     nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
    519 
    520     nf.setPrefix(AMDGPULibFunc::NATIVE);
    521     nf.setId(AMDGPULibFunc::EI_SIN);
    522     Constant *sinExpr = getFunction(M, nf);
    523 
    524     nf.setPrefix(AMDGPULibFunc::NATIVE);
    525     nf.setId(AMDGPULibFunc::EI_COS);
    526     Constant *cosExpr = getFunction(M, nf);
    527     if (sinExpr && cosExpr) {
    528       Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
    529       Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
    530       new StoreInst(cosval, aCI->getArgOperand(1), aCI);
    531 
    532       DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
    533                                           << " with native version of sin/cos");
    534 
    535       replaceCall(sinval);
    536       return true;
    537     }
    538   }
    539   return false;
    540 }
    541 
    542 bool AMDGPULibCalls::useNative(CallInst *aCI) {
    543   CI = aCI;
    544   Function *Callee = aCI->getCalledFunction();
    545 
    546   FuncInfo FInfo;
    547   if (!parseFunctionName(Callee->getName(), &FInfo) || !FInfo.isMangled() ||
    548       FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
    549       getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
    550       !(AllNative || useNativeFunc(FInfo.getName()))) {
    551     return false;
    552   }
    553 
    554   if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    555     return sincosUseNative(aCI, FInfo);
    556 
    557   FInfo.setPrefix(AMDGPULibFunc::NATIVE);
    558   Constant *F = getFunction(aCI->getModule(), FInfo);
    559   if (!F)
    560     return false;
    561 
    562   aCI->setCalledFunction(F);
    563   DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
    564                                       << " with native version");
    565   return true;
    566 }
    567 
    568 // Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
    569 // builtin, with appended type size and alignment arguments, where 2 or 4
    570 // indicates the original number of arguments. The library has optimized version
    571 // of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
    572 // power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
    573 // for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
    574 // 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
    575 bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
    576                                           FuncInfo &FInfo) {
    577   auto *Callee = CI->getCalledFunction();
    578   if (!Callee->isDeclaration())
    579     return false;
    580 
    581   assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
    582   auto *M = Callee->getParent();
    583   auto &Ctx = M->getContext();
    584   std::string Name = Callee->getName();
    585   auto NumArg = CI->getNumArgOperands();
    586   if (NumArg != 4 && NumArg != 6)
    587     return false;
    588   auto *PacketSize = CI->getArgOperand(NumArg - 2);
    589   auto *PacketAlign = CI->getArgOperand(NumArg - 1);
    590   if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
    591     return false;
    592   unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
    593   unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue();
    594   if (Size != Align || !isPowerOf2_32(Size))
    595     return false;
    596 
    597   Type *PtrElemTy;
    598   if (Size <= 8)
    599     PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
    600   else
    601     PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
    602   unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
    603   auto PtrArg = CI->getArgOperand(PtrArgLoc);
    604   unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
    605   auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
    606 
    607   SmallVector<llvm::Type *, 6> ArgTys;
    608   for (unsigned I = 0; I != PtrArgLoc; ++I)
    609     ArgTys.push_back(CI->getArgOperand(I)->getType());
    610   ArgTys.push_back(PtrTy);
    611 
    612   Name = Name + "_" + std::to_string(Size);
    613   auto *FTy = FunctionType::get(Callee->getReturnType(),
    614                                 ArrayRef<Type *>(ArgTys), false);
    615   AMDGPULibFunc NewLibFunc(Name, FTy);
    616   auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
    617   if (!F)
    618     return false;
    619 
    620   auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
    621   SmallVector<Value *, 6> Args;
    622   for (unsigned I = 0; I != PtrArgLoc; ++I)
    623     Args.push_back(CI->getArgOperand(I));
    624   Args.push_back(BCast);
    625 
    626   auto *NCI = B.CreateCall(F, Args);
    627   NCI->setAttributes(CI->getAttributes());
    628   CI->replaceAllUsesWith(NCI);
    629   CI->dropAllReferences();
    630   CI->eraseFromParent();
    631 
    632   return true;
    633 }
    634 
    635 // This function returns false if no change; return true otherwise.
    636 bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
    637   this->CI = CI;
    638   Function *Callee = CI->getCalledFunction();
    639 
    640   // Ignore indirect calls.
    641   if (Callee == 0) return false;
    642 
    643   FuncInfo FInfo;
    644   if (!parseFunctionName(Callee->getName(), &FInfo))
    645     return false;
    646 
    647   // Further check the number of arguments to see if they match.
    648   if (CI->getNumArgOperands() != FInfo.getNumArgs())
    649     return false;
    650 
    651   BasicBlock *BB = CI->getParent();
    652   LLVMContext &Context = CI->getParent()->getContext();
    653   IRBuilder<> B(Context);
    654 
    655   // Set the builder to the instruction after the call.
    656   B.SetInsertPoint(BB, CI->getIterator());
    657 
    658   // Copy fast flags from the original call.
    659   if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
    660     B.setFastMathFlags(FPOp->getFastMathFlags());
    661 
    662   if (TDOFold(CI, FInfo))
    663     return true;
    664 
    665   // Under unsafe-math, evaluate calls if possible.
    666   // According to Brian Sumner, we can do this for all f32 function calls
    667   // using host's double function calls.
    668   if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
    669     return true;
    670 
    671   // Specilized optimizations for each function call
    672   switch (FInfo.getId()) {
    673   case AMDGPULibFunc::EI_RECIP:
    674     // skip vector function
    675     assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
    676              FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
    677             "recip must be an either native or half function");
    678     return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);
    679 
    680   case AMDGPULibFunc::EI_DIVIDE:
    681     // skip vector function
    682     assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
    683              FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
    684             "divide must be an either native or half function");
    685     return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
    686 
    687   case AMDGPULibFunc::EI_POW:
    688   case AMDGPULibFunc::EI_POWR:
    689   case AMDGPULibFunc::EI_POWN:
    690     return fold_pow(CI, B, FInfo);
    691 
    692   case AMDGPULibFunc::EI_ROOTN:
    693     // skip vector function
    694     return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo);
    695 
    696   case AMDGPULibFunc::EI_FMA:
    697   case AMDGPULibFunc::EI_MAD:
    698   case AMDGPULibFunc::EI_NFMA:
    699     // skip vector function
    700     return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
    701 
    702   case AMDGPULibFunc::EI_SQRT:
    703     return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo);
    704   case AMDGPULibFunc::EI_COS:
    705   case AMDGPULibFunc::EI_SIN:
    706     if ((getArgType(FInfo) == AMDGPULibFunc::F32 ||
    707          getArgType(FInfo) == AMDGPULibFunc::F64)
    708         && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX))
    709       return fold_sincos(CI, B, AA);
    710 
    711     break;
    712   case AMDGPULibFunc::EI_READ_PIPE_2:
    713   case AMDGPULibFunc::EI_READ_PIPE_4:
    714   case AMDGPULibFunc::EI_WRITE_PIPE_2:
    715   case AMDGPULibFunc::EI_WRITE_PIPE_4:
    716     return fold_read_write_pipe(CI, B, FInfo);
    717 
    718   default:
    719     break;
    720   }
    721 
    722   return false;
    723 }
    724 
    725 bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
    726   // Table-Driven optimization
    727   const TableRef tr = getOptTable(FInfo.getId());
    728   if (tr.size==0)
    729     return false;
    730 
    731   int const sz = (int)tr.size;
    732   const TableEntry * const ftbl = tr.table;
    733   Value *opr0 = CI->getArgOperand(0);
    734 
    735   if (getVecSize(FInfo) > 1) {
    736     if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
    737       SmallVector<double, 0> DVal;
    738       for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
    739         ConstantFP *eltval = dyn_cast<ConstantFP>(
    740                                CV->getElementAsConstant((unsigned)eltNo));
    741         assert(eltval && "Non-FP arguments in math function!");
    742         bool found = false;
    743         for (int i=0; i < sz; ++i) {
    744           if (eltval->isExactlyValue(ftbl[i].input)) {
    745             DVal.push_back(ftbl[i].result);
    746             found = true;
    747             break;
    748           }
    749         }
    750         if (!found) {
    751           // This vector constants not handled yet.
    752           return false;
    753         }
    754       }
    755       LLVMContext &context = CI->getParent()->getParent()->getContext();
    756       Constant *nval;
    757       if (getArgType(FInfo) == AMDGPULibFunc::F32) {
    758         SmallVector<float, 0> FVal;
    759         for (unsigned i = 0; i < DVal.size(); ++i) {
    760           FVal.push_back((float)DVal[i]);
    761         }
    762         ArrayRef<float> tmp(FVal);
    763         nval = ConstantDataVector::get(context, tmp);
    764       } else { // F64
    765         ArrayRef<double> tmp(DVal);
    766         nval = ConstantDataVector::get(context, tmp);
    767       }
    768       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
    769       replaceCall(nval);
    770       return true;
    771     }
    772   } else {
    773     // Scalar version
    774     if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
    775       for (int i = 0; i < sz; ++i) {
    776         if (CF->isExactlyValue(ftbl[i].input)) {
    777           Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
    778           LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
    779           replaceCall(nval);
    780           return true;
    781         }
    782       }
    783     }
    784   }
    785 
    786   return false;
    787 }
    788 
    789 bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
    790   Module *M = CI->getModule();
    791   if (getArgType(FInfo) != AMDGPULibFunc::F32 ||
    792       FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
    793       !HasNative(FInfo.getId()))
    794     return false;
    795 
    796   AMDGPULibFunc nf = FInfo;
    797   nf.setPrefix(AMDGPULibFunc::NATIVE);
    798   if (Constant *FPExpr = getFunction(M, nf)) {
    799     LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
    800 
    801     CI->setCalledFunction(FPExpr);
    802 
    803     LLVM_DEBUG(dbgs() << *CI << '\n');
    804 
    805     return true;
    806   }
    807   return false;
    808 }
    809 
    810 //  [native_]half_recip(c) ==> 1.0/c
    811 bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
    812                                 const FuncInfo &FInfo) {
    813   Value *opr0 = CI->getArgOperand(0);
    814   if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
    815     // Just create a normal div. Later, InstCombine will be able
    816     // to compute the divide into a constant (avoid check float infinity
    817     // or subnormal at this point).
    818     Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
    819                                opr0,
    820                                "recip2div");
    821     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
    822     replaceCall(nval);
    823     return true;
    824   }
    825   return false;
    826 }
    827 
    828 //  [native_]half_divide(x, c) ==> x/c
    829 bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
    830                                  const FuncInfo &FInfo) {
    831   Value *opr0 = CI->getArgOperand(0);
    832   Value *opr1 = CI->getArgOperand(1);
    833   ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
    834   ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
    835 
    836   if ((CF0 && CF1) ||  // both are constants
    837       (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
    838       // CF1 is constant && f32 divide
    839   {
    840     Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
    841                                 opr1, "__div2recip");
    842     Value *nval  = B.CreateFMul(opr0, nval1, "__div2mul");
    843     replaceCall(nval);
    844     return true;
    845   }
    846   return false;
    847 }
    848 
    849 namespace llvm {
    850 static double log2(double V) {
    851 #if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
    852   return ::log2(V);
    853 #else
    854   return log(V) / 0.693147180559945309417;
    855 #endif
    856 }
    857 }
    858 
    859 bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
    860                               const FuncInfo &FInfo) {
    861   assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
    862           FInfo.getId() == AMDGPULibFunc::EI_POWR ||
    863           FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
    864          "fold_pow: encounter a wrong function call");
    865 
    866   Value *opr0, *opr1;
    867   ConstantFP *CF;
    868   ConstantInt *CINT;
    869   ConstantAggregateZero *CZero;
    870   Type *eltType;
    871 
    872   opr0 = CI->getArgOperand(0);
    873   opr1 = CI->getArgOperand(1);
    874   CZero = dyn_cast<ConstantAggregateZero>(opr1);
    875   if (getVecSize(FInfo) == 1) {
    876     eltType = opr0->getType();
    877     CF = dyn_cast<ConstantFP>(opr1);
    878     CINT = dyn_cast<ConstantInt>(opr1);
    879   } else {
    880     VectorType *VTy = dyn_cast<VectorType>(opr0->getType());
    881     assert(VTy && "Oprand of vector function should be of vectortype");
    882     eltType = VTy->getElementType();
    883     ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1);
    884 
    885     // Now, only Handle vector const whose elements have the same value.
    886     CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
    887     CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;
    888   }
    889 
    890   // No unsafe math , no constant argument, do nothing
    891   if (!isUnsafeMath(CI) && !CF && !CINT && !CZero)
    892     return false;
    893 
    894   // 0x1111111 means that we don't do anything for this call.
    895   int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
    896 
    897   if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
    898     //  pow/powr/pown(x, 0) == 1
    899     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
    900     Constant *cnval = ConstantFP::get(eltType, 1.0);
    901     if (getVecSize(FInfo) > 1) {
    902       cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    903     }
    904     replaceCall(cnval);
    905     return true;
    906   }
    907   if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    908     // pow/powr/pown(x, 1.0) = x
    909     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
    910     replaceCall(opr0);
    911     return true;
    912   }
    913   if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    914     // pow/powr/pown(x, 2.0) = x*x
    915     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
    916                       << "\n");
    917     Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    918     replaceCall(nval);
    919     return true;
    920   }
    921   if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    922     // pow/powr/pown(x, -1.0) = 1.0/x
    923     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n");
    924     Constant *cnval = ConstantFP::get(eltType, 1.0);
    925     if (getVecSize(FInfo) > 1) {
    926       cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    927     }
    928     Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    929     replaceCall(nval);
    930     return true;
    931   }
    932 
    933   Module *M = CI->getModule();
    934   if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    935     // pow[r](x, [-]0.5) = sqrt(x)
    936     bool issqrt = CF->isExactlyValue(0.5);
    937     if (Constant *FPExpr = getFunction(M,
    938         AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
    939                              : AMDGPULibFunc::EI_RSQRT, FInfo))) {
    940       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
    941                         << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
    942       Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
    943                                                         : "__pow2rsqrt");
    944       replaceCall(nval);
    945       return true;
    946     }
    947   }
    948 
    949   if (!isUnsafeMath(CI))
    950     return false;
    951 
    952   // Unsafe Math optimization
    953 
    954   // Remember that ci_opr1 is set if opr1 is integral
    955   if (CF) {
    956     double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
    957                     ? (double)CF->getValueAPF().convertToFloat()
    958                     : CF->getValueAPF().convertToDouble();
    959     int ival = (int)dval;
    960     if ((double)ival == dval) {
    961       ci_opr1 = ival;
    962     } else
    963       ci_opr1 = 0x11111111;
    964   }
    965 
    966   // pow/powr/pown(x, c) = [1/](x*x*..x); where
    967   //   trunc(c) == c && the number of x == c && |c| <= 12
    968   unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
    969   if (abs_opr1 <= 12) {
    970     Constant *cnval;
    971     Value *nval;
    972     if (abs_opr1 == 0) {
    973       cnval = ConstantFP::get(eltType, 1.0);
    974       if (getVecSize(FInfo) > 1) {
    975         cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    976       }
    977       nval = cnval;
    978     } else {
    979       Value *valx2 = nullptr;
    980       nval = nullptr;
    981       while (abs_opr1 > 0) {
    982         valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
    983         if (abs_opr1 & 1) {
    984           nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
    985         }
    986         abs_opr1 >>= 1;
    987       }
    988     }
    989 
    990     if (ci_opr1 < 0) {
    991       cnval = ConstantFP::get(eltType, 1.0);
    992       if (getVecSize(FInfo) > 1) {
    993         cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    994       }
    995       nval = B.CreateFDiv(cnval, nval, "__1powprod");
    996     }
    997     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
    998                       << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
    999                       << ")\n");
   1000     replaceCall(nval);
   1001     return true;
   1002   }
   1003 
   1004   // powr ---> exp2(y * log2(x))
   1005   // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
   1006   Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
   1007                                                    FInfo));
   1008   if (!ExpExpr)
   1009     return false;
   1010 
   1011   bool needlog = false;
   1012   bool needabs = false;
   1013   bool needcopysign = false;
   1014   Constant *cnval = nullptr;
   1015   if (getVecSize(FInfo) == 1) {
   1016     CF = dyn_cast<ConstantFP>(opr0);
   1017 
   1018     if (CF) {
   1019       double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
   1020                    ? (double)CF->getValueAPF().convertToFloat()
   1021                    : CF->getValueAPF().convertToDouble();
   1022 
   1023       V = log2(std::abs(V));
   1024       cnval = ConstantFP::get(eltType, V);
   1025       needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
   1026                      CF->isNegative();
   1027     } else {
   1028       needlog = true;
   1029       needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
   1030                                (!CF || CF->isNegative());
   1031     }
   1032   } else {
   1033     ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
   1034 
   1035     if (!CDV) {
   1036       needlog = true;
   1037       needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
   1038     } else {
   1039       assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
   1040               "Wrong vector size detected");
   1041 
   1042       SmallVector<double, 0> DVal;
   1043       for (int i=0; i < getVecSize(FInfo); ++i) {
   1044         double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
   1045                      ? (double)CDV->getElementAsFloat(i)
   1046                      : CDV->getElementAsDouble(i);
   1047         if (V < 0.0) needcopysign = true;
   1048         V = log2(std::abs(V));
   1049         DVal.push_back(V);
   1050       }
   1051       if (getArgType(FInfo) == AMDGPULibFunc::F32) {
   1052         SmallVector<float, 0> FVal;
   1053         for (unsigned i=0; i < DVal.size(); ++i) {
   1054           FVal.push_back((float)DVal[i]);
   1055         }
   1056         ArrayRef<float> tmp(FVal);
   1057         cnval = ConstantDataVector::get(M->getContext(), tmp);
   1058       } else {
   1059         ArrayRef<double> tmp(DVal);
   1060         cnval = ConstantDataVector::get(M->getContext(), tmp);
   1061       }
   1062     }
   1063   }
   1064 
   1065   if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
   1066     // We cannot handle corner cases for a general pow() function, give up
   1067     // unless y is a constant integral value. Then proceed as if it were pown.
   1068     if (getVecSize(FInfo) == 1) {
   1069       if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
   1070         double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
   1071                    ? (double)CF->getValueAPF().convertToFloat()
   1072                    : CF->getValueAPF().convertToDouble();
   1073         if (y != (double)(int64_t)y)
   1074           return false;
   1075       } else
   1076         return false;
   1077     } else {
   1078       if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) {
   1079         for (int i=0; i < getVecSize(FInfo); ++i) {
   1080           double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
   1081                      ? (double)CDV->getElementAsFloat(i)
   1082                      : CDV->getElementAsDouble(i);
   1083           if (y != (double)(int64_t)y)
   1084             return false;
   1085         }
   1086       } else
   1087         return false;
   1088     }
   1089   }
   1090 
   1091   Value *nval;
   1092   if (needabs) {
   1093     Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
   1094                                                      FInfo));
   1095     if (!AbsExpr)
   1096       return false;
   1097     nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
   1098   } else {
   1099     nval = cnval ? cnval : opr0;
   1100   }
   1101   if (needlog) {
   1102     Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
   1103                                                      FInfo));
   1104     if (!LogExpr)
   1105       return false;
   1106     nval = CreateCallEx(B,LogExpr, nval, "__log2");
   1107   }
   1108 
   1109   if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
   1110     // convert int(32) to fp(f32 or f64)
   1111     opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
   1112   }
   1113   nval = B.CreateFMul(opr1, nval, "__ylogx");
   1114   nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
   1115 
   1116   if (needcopysign) {
   1117     Value *opr_n;
   1118     Type* rTy = opr0->getType();
   1119     Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
   1120     Type *nTy = nTyS;
   1121     if (const VectorType *vTy = dyn_cast<VectorType>(rTy))
   1122       nTy = VectorType::get(nTyS, vTy->getNumElements());
   1123     unsigned size = nTy->getScalarSizeInBits();
   1124     opr_n = CI->getArgOperand(1);
   1125     if (opr_n->getType()->isIntegerTy())
   1126       opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
   1127     else
   1128       opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
   1129 
   1130     Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
   1131     sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
   1132     nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
   1133     nval = B.CreateBitCast(nval, opr0->getType());
   1134   }
   1135 
   1136   LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
   1137                     << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
   1138   replaceCall(nval);
   1139 
   1140   return true;
   1141 }
   1142 
   1143 bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
   1144                                 const FuncInfo &FInfo) {
   1145   Value *opr0 = CI->getArgOperand(0);
   1146   Value *opr1 = CI->getArgOperand(1);
   1147 
   1148   ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
   1149   if (!CINT) {
   1150     return false;
   1151   }
   1152   int ci_opr1 = (int)CINT->getSExtValue();
   1153   if (ci_opr1 == 1) {  // rootn(x, 1) = x
   1154     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
   1155     replaceCall(opr0);
   1156     return true;
   1157   }
   1158   if (ci_opr1 == 2) {  // rootn(x, 2) = sqrt(x)
   1159     std::vector<const Type*> ParamsTys;
   1160     ParamsTys.push_back(opr0->getType());
   1161     Module *M = CI->getModule();
   1162     if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
   1163                                                         FInfo))) {
   1164       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
   1165       Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
   1166       replaceCall(nval);
   1167       return true;
   1168     }
   1169   } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
   1170     Module *M = CI->getModule();
   1171     if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
   1172                                                         FInfo))) {
   1173       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
   1174       Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
   1175       replaceCall(nval);
   1176       return true;
   1177     }
   1178   } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
   1179     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
   1180     Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
   1181                                opr0,
   1182                                "__rootn2div");
   1183     replaceCall(nval);
   1184     return true;
   1185   } else if (ci_opr1 == -2) {  // rootn(x, -2) = rsqrt(x)
   1186     std::vector<const Type*> ParamsTys;
   1187     ParamsTys.push_back(opr0->getType());
   1188     Module *M = CI->getModule();
   1189     if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
   1190                                                         FInfo))) {
   1191       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
   1192                         << ")\n");
   1193       Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
   1194       replaceCall(nval);
   1195       return true;
   1196     }
   1197   }
   1198   return false;
   1199 }
   1200 
   1201 bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
   1202                                   const FuncInfo &FInfo) {
   1203   Value *opr0 = CI->getArgOperand(0);
   1204   Value *opr1 = CI->getArgOperand(1);
   1205   Value *opr2 = CI->getArgOperand(2);
   1206 
   1207   ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
   1208   ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
   1209   if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
   1210     // fma/mad(a, b, c) = c if a=0 || b=0
   1211     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
   1212     replaceCall(opr2);
   1213     return true;
   1214   }
   1215   if (CF0 && CF0->isExactlyValue(1.0f)) {
   1216     // fma/mad(a, b, c) = b+c if a=1
   1217     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
   1218                       << "\n");
   1219     Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
   1220     replaceCall(nval);
   1221     return true;
   1222   }
   1223   if (CF1 && CF1->isExactlyValue(1.0f)) {
   1224     // fma/mad(a, b, c) = a+c if b=1
   1225     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
   1226                       << "\n");
   1227     Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
   1228     replaceCall(nval);
   1229     return true;
   1230   }
   1231   if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
   1232     if (CF->isZero()) {
   1233       // fma/mad(a, b, c) = a*b if c=0
   1234       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
   1235                         << *opr1 << "\n");
   1236       Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
   1237       replaceCall(nval);
   1238       return true;
   1239     }
   1240   }
   1241 
   1242   return false;
   1243 }
   1244 
   1245 // Get a scalar native builtin signle argument FP function
   1246 Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
   1247   if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
   1248     return nullptr;
   1249   FuncInfo nf = FInfo;
   1250   nf.setPrefix(AMDGPULibFunc::NATIVE);
   1251   return getFunction(M, nf);
   1252 }
   1253 
   1254 // fold sqrt -> native_sqrt (x)
   1255 bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
   1256                                const FuncInfo &FInfo) {
   1257   if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
   1258       (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
   1259     if (Constant *FPExpr = getNativeFunction(
   1260         CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
   1261       Value *opr0 = CI->getArgOperand(0);
   1262       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
   1263                         << "sqrt(" << *opr0 << ")\n");
   1264       Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
   1265       replaceCall(nval);
   1266       return true;
   1267     }
   1268   }
   1269   return false;
   1270 }
   1271 
   1272 // fold sin, cos -> sincos.
   1273 bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   1274                                  AliasAnalysis *AA) {
   1275   AMDGPULibFunc fInfo;
   1276   if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo))
   1277     return false;
   1278 
   1279   assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
   1280          fInfo.getId() == AMDGPULibFunc::EI_COS);
   1281   bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
   1282 
   1283   Value *CArgVal = CI->getArgOperand(0);
   1284   BasicBlock * const CBB = CI->getParent();
   1285 
   1286   int const MaxScan = 30;
   1287 
   1288   { // fold in load value.
   1289     LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
   1290     if (LI && LI->getParent() == CBB) {
   1291       BasicBlock::iterator BBI = LI->getIterator();
   1292       Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA);
   1293       if (AvailableVal) {
   1294         CArgVal->replaceAllUsesWith(AvailableVal);
   1295         if (CArgVal->getNumUses() == 0)
   1296           LI->eraseFromParent();
   1297         CArgVal = CI->getArgOperand(0);
   1298       }
   1299     }
   1300   }
   1301 
   1302   Module *M = CI->getModule();
   1303   fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN);
   1304   std::string const PairName = fInfo.mangle();
   1305 
   1306   CallInst *UI = nullptr;
   1307   for (User* U : CArgVal->users()) {
   1308     CallInst *XI = dyn_cast_or_null<CallInst>(U);
   1309     if (!XI || XI == CI || XI->getParent() != CBB)
   1310       continue;
   1311 
   1312     Function *UCallee = XI->getCalledFunction();
   1313     if (!UCallee || !UCallee->getName().equals(PairName))
   1314       continue;
   1315 
   1316     BasicBlock::iterator BBI = CI->getIterator();
   1317     if (BBI == CI->getParent()->begin())
   1318       break;
   1319     --BBI;
   1320     for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
   1321       if (cast<Instruction>(BBI) == XI) {
   1322         UI = XI;
   1323         break;
   1324       }
   1325     }
   1326     if (UI) break;
   1327   }
   1328 
   1329   if (!UI) return false;
   1330 
   1331   // Merge the sin and cos.
   1332 
   1333   // for OpenCL 2.0 we have only generic implementation of sincos
   1334   // function.
   1335   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
   1336   const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
   1337   nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
   1338   Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
   1339   if (!Fsincos) return false;
   1340 
   1341   BasicBlock::iterator ItOld = B.GetInsertPoint();
   1342   AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
   1343   B.SetInsertPoint(UI);
   1344 
   1345   Value *P = Alloc;
   1346   Type *PTy = Fsincos->getFunctionType()->getParamType(1);
   1347   // The allocaInst allocates the memory in private address space. This need
   1348   // to be bitcasted to point to the address space of cos pointer type.
   1349   // In OpenCL 2.0 this is generic, while in 1.2 that is private.
   1350   if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
   1351     P = B.CreateAddrSpaceCast(Alloc, PTy);
   1352   CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
   1353 
   1354   LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
   1355                     << *Call << "\n");
   1356 
   1357   if (!isSin) { // CI->cos, UI->sin
   1358     B.SetInsertPoint(&*ItOld);
   1359     UI->replaceAllUsesWith(&*Call);
   1360     Instruction *Reload = B.CreateLoad(Alloc);
   1361     CI->replaceAllUsesWith(Reload);
   1362     UI->eraseFromParent();
   1363     CI->eraseFromParent();
   1364   } else { // CI->sin, UI->cos
   1365     Instruction *Reload = B.CreateLoad(Alloc);
   1366     UI->replaceAllUsesWith(Reload);
   1367     CI->replaceAllUsesWith(Call);
   1368     UI->eraseFromParent();
   1369     CI->eraseFromParent();
   1370   }
   1371   return true;
   1372 }
   1373 
   1374 // Get insertion point at entry.
   1375 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
   1376   Function * Func = UI->getParent()->getParent();
   1377   BasicBlock * BB = &Func->getEntryBlock();
   1378   assert(BB && "Entry block not found!");
   1379   BasicBlock::iterator ItNew = BB->begin();
   1380   return ItNew;
   1381 }
   1382 
   1383 // Insert a AllocsInst at the beginning of function entry block.
   1384 AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
   1385                                          const char *prefix) {
   1386   BasicBlock::iterator ItNew = getEntryIns(UI);
   1387   Function *UCallee = UI->getCalledFunction();
   1388   Type *RetType = UCallee->getReturnType();
   1389   B.SetInsertPoint(&*ItNew);
   1390   AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
   1391     std::string(prefix) + UI->getName());
   1392   Alloc->setAlignment(UCallee->getParent()->getDataLayout()
   1393                        .getTypeAllocSize(RetType));
   1394   return Alloc;
   1395 }
   1396 
   1397 bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo,
   1398                                             double& Res0, double& Res1,
   1399                                             Constant *copr0, Constant *copr1,
   1400                                             Constant *copr2) {
   1401   // By default, opr0/opr1/opr3 holds values of float/double type.
   1402   // If they are not float/double, each function has to its
   1403   // operand separately.
   1404   double opr0=0.0, opr1=0.0, opr2=0.0;
   1405   ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
   1406   ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
   1407   ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
   1408   if (fpopr0) {
   1409     opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
   1410              ? fpopr0->getValueAPF().convertToDouble()
   1411              : (double)fpopr0->getValueAPF().convertToFloat();
   1412   }
   1413 
   1414   if (fpopr1) {
   1415     opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
   1416              ? fpopr1->getValueAPF().convertToDouble()
   1417              : (double)fpopr1->getValueAPF().convertToFloat();
   1418   }
   1419 
   1420   if (fpopr2) {
   1421     opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64)
   1422              ? fpopr2->getValueAPF().convertToDouble()
   1423              : (double)fpopr2->getValueAPF().convertToFloat();
   1424   }
   1425 
   1426   switch (FInfo.getId()) {
   1427   default : return false;
   1428 
   1429   case AMDGPULibFunc::EI_ACOS:
   1430     Res0 = acos(opr0);
   1431     return true;
   1432 
   1433   case AMDGPULibFunc::EI_ACOSH:
   1434     // acosh(x) == log(x + sqrt(x*x - 1))
   1435     Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
   1436     return true;
   1437 
   1438   case AMDGPULibFunc::EI_ACOSPI:
   1439     Res0 = acos(opr0) / MATH_PI;
   1440     return true;
   1441 
   1442   case AMDGPULibFunc::EI_ASIN:
   1443     Res0 = asin(opr0);
   1444     return true;
   1445 
   1446   case AMDGPULibFunc::EI_ASINH:
   1447     // asinh(x) == log(x + sqrt(x*x + 1))
   1448     Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
   1449     return true;
   1450 
   1451   case AMDGPULibFunc::EI_ASINPI:
   1452     Res0 = asin(opr0) / MATH_PI;
   1453     return true;
   1454 
   1455   case AMDGPULibFunc::EI_ATAN:
   1456     Res0 = atan(opr0);
   1457     return true;
   1458 
   1459   case AMDGPULibFunc::EI_ATANH:
   1460     // atanh(x) == (log(x+1) - log(x-1))/2;
   1461     Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
   1462     return true;
   1463 
   1464   case AMDGPULibFunc::EI_ATANPI:
   1465     Res0 = atan(opr0) / MATH_PI;
   1466     return true;
   1467 
   1468   case AMDGPULibFunc::EI_CBRT:
   1469     Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
   1470     return true;
   1471 
   1472   case AMDGPULibFunc::EI_COS:
   1473     Res0 = cos(opr0);
   1474     return true;
   1475 
   1476   case AMDGPULibFunc::EI_COSH:
   1477     Res0 = cosh(opr0);
   1478     return true;
   1479 
   1480   case AMDGPULibFunc::EI_COSPI:
   1481     Res0 = cos(MATH_PI * opr0);
   1482     return true;
   1483 
   1484   case AMDGPULibFunc::EI_EXP:
   1485     Res0 = exp(opr0);
   1486     return true;
   1487 
   1488   case AMDGPULibFunc::EI_EXP2:
   1489     Res0 = pow(2.0, opr0);
   1490     return true;
   1491 
   1492   case AMDGPULibFunc::EI_EXP10:
   1493     Res0 = pow(10.0, opr0);
   1494     return true;
   1495 
   1496   case AMDGPULibFunc::EI_EXPM1:
   1497     Res0 = exp(opr0) - 1.0;
   1498     return true;
   1499 
   1500   case AMDGPULibFunc::EI_LOG:
   1501     Res0 = log(opr0);
   1502     return true;
   1503 
   1504   case AMDGPULibFunc::EI_LOG2:
   1505     Res0 = log(opr0) / log(2.0);
   1506     return true;
   1507 
   1508   case AMDGPULibFunc::EI_LOG10:
   1509     Res0 = log(opr0) / log(10.0);
   1510     return true;
   1511 
   1512   case AMDGPULibFunc::EI_RSQRT:
   1513     Res0 = 1.0 / sqrt(opr0);
   1514     return true;
   1515 
   1516   case AMDGPULibFunc::EI_SIN:
   1517     Res0 = sin(opr0);
   1518     return true;
   1519 
   1520   case AMDGPULibFunc::EI_SINH:
   1521     Res0 = sinh(opr0);
   1522     return true;
   1523 
   1524   case AMDGPULibFunc::EI_SINPI:
   1525     Res0 = sin(MATH_PI * opr0);
   1526     return true;
   1527 
   1528   case AMDGPULibFunc::EI_SQRT:
   1529     Res0 = sqrt(opr0);
   1530     return true;
   1531 
   1532   case AMDGPULibFunc::EI_TAN:
   1533     Res0 = tan(opr0);
   1534     return true;
   1535 
   1536   case AMDGPULibFunc::EI_TANH:
   1537     Res0 = tanh(opr0);
   1538     return true;
   1539 
   1540   case AMDGPULibFunc::EI_TANPI:
   1541     Res0 = tan(MATH_PI * opr0);
   1542     return true;
   1543 
   1544   case AMDGPULibFunc::EI_RECIP:
   1545     Res0 = 1.0 / opr0;
   1546     return true;
   1547 
   1548   // two-arg functions
   1549   case AMDGPULibFunc::EI_DIVIDE:
   1550     Res0 = opr0 / opr1;
   1551     return true;
   1552 
   1553   case AMDGPULibFunc::EI_POW:
   1554   case AMDGPULibFunc::EI_POWR:
   1555     Res0 = pow(opr0, opr1);
   1556     return true;
   1557 
   1558   case AMDGPULibFunc::EI_POWN: {
   1559     if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
   1560       double val = (double)iopr1->getSExtValue();
   1561       Res0 = pow(opr0, val);
   1562       return true;
   1563     }
   1564     return false;
   1565   }
   1566 
   1567   case AMDGPULibFunc::EI_ROOTN: {
   1568     if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
   1569       double val = (double)iopr1->getSExtValue();
   1570       Res0 = pow(opr0, 1.0 / val);
   1571       return true;
   1572     }
   1573     return false;
   1574   }
   1575 
   1576   // with ptr arg
   1577   case AMDGPULibFunc::EI_SINCOS:
   1578     Res0 = sin(opr0);
   1579     Res1 = cos(opr0);
   1580     return true;
   1581 
   1582   // three-arg functions
   1583   case AMDGPULibFunc::EI_FMA:
   1584   case AMDGPULibFunc::EI_MAD:
   1585     Res0 = opr0 * opr1 + opr2;
   1586     return true;
   1587   }
   1588 
   1589   return false;
   1590 }
   1591 
   1592 bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
   1593   int numArgs = (int)aCI->getNumArgOperands();
   1594   if (numArgs > 3)
   1595     return false;
   1596 
   1597   Constant *copr0 = nullptr;
   1598   Constant *copr1 = nullptr;
   1599   Constant *copr2 = nullptr;
   1600   if (numArgs > 0) {
   1601     if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
   1602       return false;
   1603   }
   1604 
   1605   if (numArgs > 1) {
   1606     if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
   1607       if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
   1608         return false;
   1609     }
   1610   }
   1611 
   1612   if (numArgs > 2) {
   1613     if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
   1614       return false;
   1615   }
   1616 
   1617   // At this point, all arguments to aCI are constants.
   1618 
   1619   // max vector size is 16, and sincos will generate two results.
   1620   double DVal0[16], DVal1[16];
   1621   bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
   1622   if (getVecSize(FInfo) == 1) {
   1623     if (!evaluateScalarMathFunc(FInfo, DVal0[0],
   1624                                 DVal1[0], copr0, copr1, copr2)) {
   1625       return false;
   1626     }
   1627   } else {
   1628     ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
   1629     ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
   1630     ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2);
   1631     for (int i=0; i < getVecSize(FInfo); ++i) {
   1632       Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
   1633       Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
   1634       Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr;
   1635       if (!evaluateScalarMathFunc(FInfo, DVal0[i],
   1636                                   DVal1[i], celt0, celt1, celt2)) {
   1637         return false;
   1638       }
   1639     }
   1640   }
   1641 
   1642   LLVMContext &context = CI->getParent()->getParent()->getContext();
   1643   Constant *nval0, *nval1;
   1644   if (getVecSize(FInfo) == 1) {
   1645     nval0 = ConstantFP::get(CI->getType(), DVal0[0]);
   1646     if (hasTwoResults)
   1647       nval1 = ConstantFP::get(CI->getType(), DVal1[0]);
   1648   } else {
   1649     if (getArgType(FInfo) == AMDGPULibFunc::F32) {
   1650       SmallVector <float, 0> FVal0, FVal1;
   1651       for (int i=0; i < getVecSize(FInfo); ++i)
   1652         FVal0.push_back((float)DVal0[i]);
   1653       ArrayRef<float> tmp0(FVal0);
   1654       nval0 = ConstantDataVector::get(context, tmp0);
   1655       if (hasTwoResults) {
   1656         for (int i=0; i < getVecSize(FInfo); ++i)
   1657           FVal1.push_back((float)DVal1[i]);
   1658         ArrayRef<float> tmp1(FVal1);
   1659         nval1 = ConstantDataVector::get(context, tmp1);
   1660       }
   1661     } else {
   1662       ArrayRef<double> tmp0(DVal0);
   1663       nval0 = ConstantDataVector::get(context, tmp0);
   1664       if (hasTwoResults) {
   1665         ArrayRef<double> tmp1(DVal1);
   1666         nval1 = ConstantDataVector::get(context, tmp1);
   1667       }
   1668     }
   1669   }
   1670 
   1671   if (hasTwoResults) {
   1672     // sincos
   1673     assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
   1674            "math function with ptr arg not supported yet");
   1675     new StoreInst(nval1, aCI->getArgOperand(1), aCI);
   1676   }
   1677 
   1678   replaceCall(nval0);
   1679   return true;
   1680 }
   1681 
   1682 // Public interface to the Simplify LibCalls pass.
   1683 FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
   1684   return new AMDGPUSimplifyLibCalls(Opt);
   1685 }
   1686 
   1687 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
   1688   return new AMDGPUUseNativeCalls();
   1689 }
   1690 
   1691 static bool setFastFlags(Function &F, const TargetOptions &Options) {
   1692   AttrBuilder B;
   1693 
   1694   if (Options.UnsafeFPMath || Options.NoInfsFPMath)
   1695     B.addAttribute("no-infs-fp-math", "true");
   1696   if (Options.UnsafeFPMath || Options.NoNaNsFPMath)
   1697     B.addAttribute("no-nans-fp-math", "true");
   1698   if (Options.UnsafeFPMath) {
   1699     B.addAttribute("less-precise-fpmad", "true");
   1700     B.addAttribute("unsafe-fp-math", "true");
   1701   }
   1702 
   1703   if (!B.hasAttributes())
   1704     return false;
   1705 
   1706   F.addAttributes(AttributeList::FunctionIndex, B);
   1707 
   1708   return true;
   1709 }
   1710 
   1711 bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
   1712   if (skipFunction(F))
   1713     return false;
   1714 
   1715   bool Changed = false;
   1716   auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   1717 
   1718   LLVM_DEBUG(dbgs() << "AMDIC: process function ";
   1719              F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
   1720 
   1721   if (!EnablePreLink)
   1722     Changed |= setFastFlags(F, Options);
   1723 
   1724   for (auto &BB : F) {
   1725     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
   1726       // Ignore non-calls.
   1727       CallInst *CI = dyn_cast<CallInst>(I);
   1728       ++I;
   1729       if (!CI) continue;
   1730 
   1731       // Ignore indirect calls.
   1732       Function *Callee = CI->getCalledFunction();
   1733       if (Callee == 0) continue;
   1734 
   1735       LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
   1736                  dbgs().flush());
   1737       if(Simplifier.fold(CI, AA))
   1738         Changed = true;
   1739     }
   1740   }
   1741   return Changed;
   1742 }
   1743 
   1744 bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
   1745   if (skipFunction(F) || UseNative.empty())
   1746     return false;
   1747 
   1748   bool Changed = false;
   1749   for (auto &BB : F) {
   1750     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
   1751       // Ignore non-calls.
   1752       CallInst *CI = dyn_cast<CallInst>(I);
   1753       ++I;
   1754       if (!CI) continue;
   1755 
   1756       // Ignore indirect calls.
   1757       Function *Callee = CI->getCalledFunction();
   1758       if (Callee == 0) continue;
   1759 
   1760       if(Simplifier.useNative(CI))
   1761         Changed = true;
   1762     }
   1763   }
   1764   return Changed;
   1765 }
   1766