/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
    void __cdecl CallPrint(const char* fmt, ...);

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to a
    ///        16-bit float with 5 exponent bits and 10 mantissa bits
    ///        (half precision).
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
    static uint16_t Convert32To16Float(float val)
    {
        uint32_t sign, exp, mant;
        uint32_t roundBits;

        // Extract the sign, exponent, and mantissa
        uint32_t uf = *(uint32_t*)&val;
        sign = (uf & 0x80000000) >> 31;
        exp = (uf & 0x7F800000) >> 23;
        mant = uf & 0x007FFFFF;

        // Check for out of range
        if (std::isnan(val))
        {
            exp = 0x1F;
            mant = 0x200;
            sign = 1;                     // set the sign bit for NANs
        }
        else if (std::isinf(val))
        {
            exp = 0x1f;
            mant = 0x0;
        }
        else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
        {
            exp = 0x1E;
            mant = 0x3FF;
        }
        else if ((exp <= 0x70) && (exp >= 0x66)) // Result is a half-precision denormal
        {
            mant |= 0x00800000;
            for (; exp <= 0x70; mant >>= 1, exp++)
                ;
            exp = 0;
            mant = mant >> 13;
        }
        else if (exp < 0x66) // Too small to represent -> Zero
        {
            exp = 0;
            mant = 0;
        }
        else
        {
            // Saves bits that will be shifted off for rounding
            roundBits = mant & 0x1FFFu;
            // convert exponent and mantissa to 16 bit format
            exp = exp - 0x70;
            mant = mant >> 13;

            // Essentially RTZ, but round up if off by only 1 lsb
            if (roundBits == 0x1FFFu)
            {
                mant++;
                // check for overflow
                if ((mant & 0xC00u) != 0)
                    exp++;
                // make sure only the needed bits are used
                mant &= 0x3FF;
            }
        }

        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
        return (uint16_t)tmpVal;
    }
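
    // Worked example (assuming IEEE 754 single- and half-precision layouts):
    // 1.5f is 0x3FC00000, so sign = 0, exp = 127, mant = 0x400000; the final branch
    // above yields exp = 127 - 0x70 = 15 and mant = 0x400000 >> 13 = 0x200, giving
    // the half-precision bit pattern 0x3E00 (= 1.5).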

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
    ///        float
    /// @param val - 16-bit float
    /// @todo Maybe move this outside of this file into a header?
    static float ConvertSmallFloatTo32(UINT val)
    {
        UINT result;
        if ((val & 0x7fff) == 0)
        {
            result = ((uint32_t)(val & 0x8000)) << 16;
        }
        else if ((val & 0x7c00) == 0x7c00)
        {
            result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
            result |= ((uint32_t)val & 0x8000) << 16;
        }
        else
        {
            uint32_t sign = (val & 0x8000) << 16;
            uint32_t mant = (val & 0x3ff) << 13;
            uint32_t exp = (val >> 10) & 0x1f;
            if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
            {
                mant <<= 1;
                while (mant < (0x400 << 13))
                {
                    exp--;
                    mant <<= 1;
                }
                mant &= (0x3ff << 13);
            }
            exp = ((exp - 15 + 127) & 0xff) << 23;
            result = sign | exp | mant;
        }

        return *(float*)&result;
    }
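
    // Worked example (round-tripping the half-precision value above): 0x3E00 has
    // sign = 0, exp = 15, mant = 0x200 << 13 = 0x400000; re-biasing gives
    // exp = (15 - 15 + 127) << 23 = 0x3F800000, so result = 0x3FC00000, i.e. 1.5f.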

    Constant *Builder::C(bool i)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
    }

    Constant *Builder::C(char i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(uint8_t i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(int i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(int64_t i)
    {
        return ConstantInt::get(IRB()->getInt64Ty(), i);
    }

    Constant *Builder::C(uint16_t i)
    {
        return ConstantInt::get(mInt16Ty,i);
    }

    Constant *Builder::C(uint32_t i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(float i)
    {
        return ConstantFP::get(IRB()->getFloatTy(), i);
    }
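
    // Note on overload resolution: the argument's C++ type picks the LLVM constant type,
    // e.g. C(1) builds an i32 constant while C((uint8_t)1) builds an i8 constant, so
    // callers typically cast the literal when a specific width is required.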

    Constant *Builder::PRED(bool pred)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
    }

    Value *Builder::VIMMED1(int i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(uint32_t i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(float i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
    }

    Value *Builder::VIMMED1(bool i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }
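
    // For example, VIMMED1(1.0f) splats 1.0 across an <mVWidth x float> constant
    // vector; VRCP() below uses exactly that splat as the numerator of its FDIV.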

    Value *Builder::VUNDEF_IPTR()
    {
        return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
    }

    Value *Builder::VUNDEF_I()
    {
        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
    }

    Value *Builder::VUNDEF(Type *ty, uint32_t size)
    {
        return UndefValue::get(VectorType::get(ty, size));
    }

    Value *Builder::VUNDEF_F()
    {
        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
    }

    Value *Builder::VUNDEF(Type* t)
    {
        return UndefValue::get(VectorType::get(t, mVWidth));
    }

    #if HAVE_LLVM == 0x306
    Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
    {
        return VINSERT(vec, val, C((int64_t)index));
    }
    #endif

    Value *Builder::VBROADCAST(Value *src)
    {
        // check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }

        return VECTOR_SPLAT(mVWidth, src);
    }

    uint32_t Builder::IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getZExtValue();
    }

    int32_t Builder::S_IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getSExtValue();
    }
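
    // These helpers only make sense for compile-time constants, e.g. IMMED(C(7)) == 7;
    // the SWR_ASSERT above fires if the Value is not a ConstantInt.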

    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }
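
    // Usage sketch: the gather emulation below uses GEP(vSrcPtr, {C(0), C(i)}) to form
    // the address of lane i of an alloca'd SIMD value before loading from it.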

    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
    {
        std::vector<Value*> args;
        for (auto arg : argsList)
            args.push_back(arg);
        return CALLA(Callee, args);
    }

    #if HAVE_LLVM > 0x306
    CallInst *Builder::CALL(Value *Callee, Value* arg)
    {
        std::vector<Value*> args;
        args.push_back(arg);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        args.push_back(arg3);
        return CALLA(Callee, args);
    }
    #endif

    Value *Builder::VRCP(Value *va)
    {
        return FDIV(VIMMED1(1.0f), va);  // 1 / a
    }

    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
    {
        Value* vOut = FMADDPS(vA, vX, vC);
        vOut = FMADDPS(vB, vY, vOut);
        return vOut;
    }
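
    // VPLANEPS evaluates the plane equation per SIMD lane: assuming FMADDPS(a, b, c)
    // computes a * b + c, the two steps above produce vOut = vA * vX + vB * vY + vC.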

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src,Value* mask)
    {
        Value* vResult;
        // use avx2 maskload instruction if available
        if(JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func,{src,mask});
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
    #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
    #else
            mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
    #endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
        }
        return vResult;
    }
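
    // Both paths follow the hardware maskload semantics: only the sign bit of each
    // mask element is consulted, and masked-off lanes come back as zero rather than
    // touching memory.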

    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
    ///   result from a GEP, printing out the pointer to memory
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to std out
    CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.  we still need to modify it for vectors
        printCallArgs.resize(1);

        // search through the format string for special processing
        size_t pos = 0;
        std::string tempStr(printStr);
        pos = tempStr.find('%', pos);
        auto v = printArgs.begin();

        while ((pos != std::string::npos) && (v != printArgs.end()))
        {
            Value* pArg = *v;
            Type* pType = pArg->getType();

            if (pType->isVectorTy())
            {
                Type* pContainedType = pType->getContainedType(0);

                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr[pos + 1] = 'x';
                    tempStr.insert(pos + 2, "%08X ");
                    pos += 7;

                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                    std::string vectorFormatStr;
                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                    {
                        vectorFormatStr += "0x%08X ";
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }

                    tempStr.insert(pos, vectorFormatStr);
                    pos += vectorFormatStr.size();
                }
                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%f "));
                        pos += 3;
                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                    }
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%d "));
                        pos += 3;
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
            }
            else
            {
                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr.insert(pos + 1, "x%08");
                    printCallArgs.push_back(pArg);
                    pos += 3;
                }
                // for %f we need to cast float Values to doubles so that they print out correctly
                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
                {
                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                    pos++;
                }
                else
                {
                    printCallArgs.push_back(pArg);
                }
            }

            // advance to the next argument
            v++;
            pos = tempStr.find('%', ++pos);
        }

        // create global variable constant string
        Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

        // get a pointer to the first character in the constant string array
        std::vector<Constant*> geplist{C(0),C(0)};
    #if HAVE_LLVM == 0x306
        Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
    #else
        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
    #endif

        // insert the pointer to the format string in the argument vector
        printCallArgs[0] = strGEP;

        // get pointer to CallPrint function and insert decl into the module if needed
        std::vector<Type*> args;
        args.push_back(PointerType::get(mInt8Ty,0));
        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

        // if we haven't yet added the symbol to the symbol table
        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
        }

        // insert a call to CallPrint
        return CALLA(callPrintFn,printCallArgs);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Wrapper around PRINT with initializer list.
    CallInst* Builder::PRINT(const std::string &printStr)
    {
        return PRINT(printStr, {});
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is used for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            vMask = BITCAST(vMask, mSimdFP32Ty);
            vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
            Value *vOffsets = MUL(vIndices,vScaleVec);
            Value *mask = MASK(vMask);
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets,C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase,offset);
                loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
                Value *selMask = VEXTRACT(mask,C(i));
                // switch in a safe address to load for lanes that are masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather,val,C(i));
            }
            STACKRESTORE(pStack);
        }

        return vGather;
    }
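
    // Either path above is per-lane equivalent to:
    //   vGather[i] = vMask[i] ? *(float*)(pBase + vIndices[i] * scale) : vSrc[i];
    // the emulation just points masked-off lanes at the stack copy of vSrc instead
    // of at memory.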

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is used for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            Value *mask = MASK(vMask);
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(mask, C(i));
                // switch in a safe address to load for lanes that are masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }
        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is used for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if(JM()->mArch.AVX2())
        {
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
            Value *vOffsets = MUL(vIndices,vScaleVec);
            Value *mask = MASK(vMask);
            for(uint32_t i = 0; i < mVWidth/2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets,C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase,offset);
                loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
                Value *selMask = VEXTRACT(mask,C(i));
                // switch in a safe address to load for lanes that are masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather,val,C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
    Value* Builder::MASK(Value* vmask)
    {
        Value* src = BITCAST(vmask, mSimdInt32Ty);
        return ICMP_SLT(src, VIMMED1(0));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
    Value* Builder::VMASK(Value* mask)
    {
        return S_EXT(mask, mSimdInt32Ty);
    }
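
    // VMASK and MASK are inverses of each other: VMASK sign-extends each i1 lane to an
    // all-ones/all-zeros i32 lane, and MASK recovers the i1 lanes by testing the sign
    // bit (ICMP_SLT 0), matching the x86 convention of using the element MSB as the mask.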

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
    /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
    /// lower 128 bits of a, and vice versa for the upper lanes.  If the mask
    /// value is negative, '0' is inserted.
    Value *Builder::PSHUFB(Value* a, Value* b)
    {
        Value* res;
        // use avx2 pshufb instruction if available
        if(JM()->mArch.AVX2())
        {
            res = VPSHUFB(a, b);
        }
        else
        {
            Constant* cB = dyn_cast<Constant>(b);
            // number of 8 bit elements in b
            uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
            // output vector
            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));

            // insert an 8 bit value from the high and low lanes of a per loop iteration
            numElms /= 2;
            for(uint32_t i = 0; i < numElms; i++)
            {
                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
                ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));

                // extract values from constant mask
                char valLow128bLane =  (char)(cLow128b->getSExtValue());
                char valHigh128bLane = (char)(cHigh128b->getSExtValue());

                Value* insertValLow128b;
                Value* insertValHigh128b;

                // if the mask value is negative, insert a '0' in the respective output position
                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));

                vShuf = VINSERT(vShuf, insertValLow128b, i);
                vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
            }
            res = vShuf;
        }
        return res;
    }
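
    // Example for one byte of the emulated path: a mask byte of 0x83 is negative, so a
    // zero byte is written to that output position; a mask byte of 0x03 selects byte 3
    // of the corresponding 128-bit half of a (only bits 3..0 of the mask byte are used).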

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
    /// lower 8 values are used.
    Value *Builder::PMOVSXBD(Value* a)
    {
        // llvm-3.9 removed the pmovsxbd intrinsic
    #if HAVE_LLVM < 0x309
        // use avx2 byte sign extend instruction if available
        if(JM()->mArch.AVX2())
        {
            Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
            return CALL(pmovsxbd, std::initializer_list<Value*>{a});
        }
        else
    #endif
        {
            // VPMOVSXBD output type
            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
            // Extract 8 values from 128bit lane and sign extend
            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
    Value *Builder::PMOVSXWD(Value* a)
    {
        // llvm-3.9 removed the pmovsxwd intrinsic
    #if HAVE_LLVM < 0x309
        // use avx2 word sign extend if available
        if(JM()->mArch.AVX2())
        {
            Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
            return CALL(pmovsxwd, std::initializer_list<Value*>{a});
        }
        else
    #endif
        {
            // VPMOVSXWD output type
            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
            // Extract 8 values from 128bit lane and sign extend
            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMD(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if(JM()->mArch.AVX2())
        {
            res = VPERMD(a, idx);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_I();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }
        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMPS(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if (JM()->mArch.AVX2())
        {
            // llvm 3.6.0 swapped the order of the args to vpermps
            res = VPERMPS(idx, a);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_F();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }

        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
    Value *Builder::CVTPH2PS(Value* a)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPH2PS(a);
        }
        else
        {
            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
            }

            Value* pResult = UndefValue::get(mSimdFP32Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float32 values to convert.
    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPS2PH(a, rounding);
        }
        else
        {
            // call scalar C function for now
            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
            }

            Value* pResult = UndefValue::get(mSimdInt16Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }
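
    // Note: the scalar fallback ignores the rounding argument; Convert32To16Float above
    // applies its own (essentially RTZ) rounding regardless of the mode requested.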

    Value *Builder::PMAXSD(Value* a, Value* b)
    {
        // llvm-3.9 removed the pmax intrinsics
    #if HAVE_LLVM >= 0x309
        Value* cmp = ICMP_SGT(a, b);
        return SELECT(cmp, a, b);
    #else
        if (JM()->mArch.AVX2())
        {
            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
            return CALL(pmaxsd, {a, b});
        }
        else
        {
            // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);

            // low 128
            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
            Value* resLo = CALL(pmaxsd, {aLo, bLo});

            // high 128
            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
            Value* resHi = CALL(pmaxsd, {aHi, bHi});

            // combine
            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
            result = VINSERTI128(result, resHi, C((uint8_t)1));

            return result;
        }
    #endif
    }
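
    // On llvm >= 3.9 the compare+select sequence is the canonical signed-max pattern,
    // which the x86 backend can usually re-fold into a single vpmaxsd, so the explicit
    // intrinsics are only needed for the older versions handled above.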

    Value *Builder::PMINSD(Value* a, Value* b)
    {
        // llvm-3.9 removed the pmin intrinsics
    #if HAVE_LLVM >= 0x309
        Value* cmp = ICMP_SLT(a, b);
        return SELECT(cmp, a, b);
    #else
        if (JM()->mArch.AVX2())
        {
            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
            return CALL(pminsd, {a, b});
        }
        else
        {
            // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);

            // low 128
            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
            Value* resLo = CALL(pminsd, {aLo, bLo});

            // high 128
            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
            Value* resHi = CALL(pminsd, {aHi, bHi});

            // combine
            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
            result = VINSERTI128(result, resHi, C((uint8_t)1));

            return result;
        }
    #endif
    }

    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                          Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            // ensure our mask is the correct type
            mask = BITCAST(mask, mSimdFP32Ty);
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            // ensure our mask is the correct type
            mask = BITCAST(mask, mSimdInt32Ty);
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch(info.bpp / info.numComps)
        {
            case 16:
            {
                    Value* vGatherResult[2];
                    Value *vMask;

                    // TODO: vGatherMaskedVal
                    Value* vGatherMaskedVal = VIMMED1((float)0);

                    // always have at least one component out of x or y to fetch

                    // save mask as it is zero'd out after each gather
                    vMask = mask;

                    vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                    //

                    // if we have at least one component out of z or w to fetch
                    if(info.numComps > 2)
                    {
                        // offset base to the next components(zw) in the vertex to gather
                        pSrcBase = GEP(pSrcBase, C((char)4));
                        vMask = mask;

                        vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                        // e.g. result of second 8x32bit integer gather for 16bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                        //
                    }
                    else
                    {
                        vGatherResult[1] =  vGatherMaskedVal;
                    }

                    // Shuffle gathered components into place, each row is a component
                    Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
                break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
                }

                for(uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // save mask as it is zero'd out after each gather
                    Value *vMask = mask;

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
                break;
            default:
                SWR_ASSERT(0, "Invalid float format");
                break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
            case 8:
            {
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);
                Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
                // e.g. result of an 8x32bit integer gather for 8bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
                break;
            case 16:
            {
                Value* vGatherResult[2];
                Value *vMask;

                // TODO: vGatherMaskedVal
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);

                // always have at least one component out of x or y to fetch

                // save mask as it is zero'd out after each gather
                vMask = mask;

                vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                // e.g. result of first 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                //

                // if we have at least one component out of z or w to fetch
                if(info.numComps > 2)
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                    vMask = mask;

                    vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }
                else
                {
                    vGatherResult[1] = vGatherMaskedVal;
                }

                // Shuffle gathered components into place, each row is a component
                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

            }
                break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
                }

                for(uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // save mask as it is zero'd out after each gather
                    Value *vMask = mask;

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
                break;
            default:
                SWR_ASSERT(0, "unsupported format");
            break;
        }
    }

    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if(bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if(info.numComps > 2)
            {
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for(uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed output
   1224                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
   1225                 if(i >= info.numComps)
   1226                 {
   1227                     // set the default component val
   1228                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
   1229                     continue;
   1230                 }
   1231 
   1232                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1233                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1234                 // if x or y, use vi128XY permute result, else use vi128ZW
   1235                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1236 
   1237                 // extract packed component 128 bit lanes
   1238                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
   1239             }
   1240 
   1241         }
   1242         else
   1243         {
   1244             // pshufb masks for each component
   1245             Value* vConstMask[2];
   1246             // x/z shuffle mask
   1247             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
   1248                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
   1249 
   1250             // y/w shuffle mask
   1251             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
   1252                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
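                 // note: mask bytes of -1 have their high bit set, which makes pshufb write a
                 // zero byte; that is what zero-extends each 16-bit component to 32 bits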
   1253 
   1254 
   1255             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
   1256             // apply defaults
   1257             for (uint32_t i = 0; i < 4; ++i)
   1258             {
   1259                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
   1260             }
   1261 
   1262             for(uint32_t i = 0; i < info.numComps; i++)
   1263             {
   1264                 uint32_t swizzleIndex = info.swizzle[i];
   1265 
   1266                 // select correct constMask for x/z or y/w pshufb
   1267                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
    1268                 // if x or y, use the first gather result, else use the second
   1269                 uint32_t selectedGather = (i < 2) ? 0 : 1;
   1270 
   1271                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
    1272                 // after pshufb for the x channel; z uses the same shuffle applied to the second gather
   1273                 // 256i - 0    1    2    3    4    5    6    7
   1274                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
   1275             }
   1276         }
   1277     }
   1278 
   1279     void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
   1280     {
   1281         // cast types
   1282         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
   1283         Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
   1284 
   1285         if(bPackedOutput)
   1286         {
   1287             Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1288             // shuffle mask
   1289             Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
   1290                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
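                 // note: each gathered 32-bit lane holds a packed 8bpc texel with component 0
                 // in the low byte, so bytes 0,4,8,12 are the x components of the four texels
                 // in a 128-bit lane, 1,5,9,13 the y components, and so on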
   1291             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
   1292             // after pshufb: group components together in each 128bit lane
   1293             // 256i - 0    1    2    3    4    5    6    7
   1294             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
   1295 
   1296             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
   1297             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
   1298             // 256i - 0    1    2    3    4    5    6    7
   1299             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
   1300 
   1301             // do the same for zw components
   1302             Value* vi128ZW = nullptr;
   1303             if(info.numComps > 2)
   1304             {
   1305                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
   1306             }
   1307 
    1308             // extract the packed 128-bit lanes for each enabled component; disabled components get their default value
   1309             for(uint32_t i = 0; i < 4; i++)
   1310             {
   1311                 uint32_t swizzleIndex = info.swizzle[i];
   1312                 // todo: fix for packed
   1313                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
   1314                 if(i >= info.numComps)
   1315                 {
   1316                     // set the default component val
   1317                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
   1318                     continue;
   1319                 }
   1320 
   1321                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1322                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1323                 // if x or y, use vi128XY permute result, else use vi128ZW
   1324                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1325 
    1326                 // extract packed component 128 bit lanes
   1327                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
   1328             }
   1329         }
   1330         // else zero extend
   1331         else{
   1332             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
   1333             // apply defaults
   1334             for (uint32_t i = 0; i < 4; ++i)
   1335             {
   1336                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
   1337             }
   1338 
   1339             for(uint32_t i = 0; i < info.numComps; i++){
   1340                 uint32_t swizzleIndex = info.swizzle[i];
   1341 
   1342                 // pshufb masks for each component
   1343                 Value* vConstMask;
   1344                 switch(i)
   1345                 {
   1346                     case 0:
   1347                         // x shuffle mask
   1348                         vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
   1349                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
   1350                         break;
   1351                     case 1:
   1352                         // y shuffle mask
   1353                         vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
   1354                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
   1355                         break;
   1356                     case 2:
   1357                         // z shuffle mask
   1358                         vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
   1359                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
   1360                         break;
   1361                     case 3:
   1362                         // w shuffle mask
   1363                         vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
   1364                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
   1365                         break;
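                             // defensive only: info.numComps is at most 4 for SWR formats, so i
                             // never reaches the default case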
   1366                     default:
   1367                         vConstMask = nullptr;
   1368                         break;
   1369                 }
   1370 
    1371                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
    1372                 // after pshufb for x channel
    1373                 // 256i - 0    1    2    3    4    5    6    7
    1374                 //        x000 x000 x000 x000 x000 x000 x000 x000
   1375             }
   1376         }
   1377     }
   1378 
   1379     // Helper function to create alloca in entry block of function
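             // Allocas created in the entry block are treated by LLVM as static stack slots;
             // creating them at the current insert point (which may sit inside a loop) would
             // grow the stack on every iteration instead of reusing a single slot.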
   1380     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
   1381     {
   1382         auto saveIP = IRB()->saveIP();
   1383         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
   1384                               pFunc->getEntryBlock().begin());
   1385         Value* pAlloca = ALLOCA(pType);
   1386         IRB()->restoreIP(saveIP);
   1387         return pAlloca;
   1388     }
   1389 
   1390     //////////////////////////////////////////////////////////////////////////
   1391     /// @brief emulates a scatter operation.
   1392     /// @param pDst - pointer to destination
   1393     /// @param vSrc - vector of src data to scatter
   1394     /// @param vOffsets - vector of byte offsets from pDst
   1395     /// @param vMask - mask of valid lanes
   1396     void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
   1397     {
   1398         /* Scatter algorithm
   1399 
   1400            while(Index = BitScanForward(mask))
   1401                 srcElem = srcVector[Index]
   1402                 offsetElem = offsetVector[Index]
   1403                 *(pDst + offsetElem) = srcElem
    1404                 Update mask (mask &= ~(1 << Index))
   1405 
   1406         */
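                 // note: AVX2 provides hardware gathers but no scatter instruction (scatters
                 // first appear with AVX-512), so the store side is emulated with a scalar
                 // loop over the set bits of the mask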
   1407 
   1408         BasicBlock* pCurBB = IRB()->GetInsertBlock();
   1409         Function* pFunc = pCurBB->getParent();
   1410         Type* pSrcTy = vSrc->getType()->getVectorElementType();
   1411 
   1412         // Store vectors on stack
   1413         if (pScatterStackSrc == nullptr)
   1414         {
   1415             // Save off stack allocations and reuse per scatter. Significantly reduces stack
   1416             // requirements for shaders with a lot of scatters.
   1417             pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
   1418             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
   1419         }
   1420 
   1421         Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
   1422         Value* pOffsetsArrayPtr = pScatterStackOffsets;
   1423         STORE(vSrc, pSrcArrayPtr);
   1424         STORE(vOffsets, pOffsetsArrayPtr);
   1425 
   1426         // Cast to pointers for random access
   1427         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
   1428         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
   1429 
   1430         Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
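                 // movmskps packs the sign bit of each 32-bit lane into one bit of an integer,
                 // giving a per-lane bitmask the scalar loop below can walk with cttz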
   1431 
   1432         // Get cttz function
   1433         Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
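                 // with the second argument false, llvm.cttz is defined for a zero input and
                 // returns the bit width (32); that value is used below to detect an empty mask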
   1434 
   1435         // Setup loop basic block
   1436         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
   1437 
   1438         // compute first set bit
   1439         Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
   1440 
   1441         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
   1442 
   1443         // Split current block
   1444         BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
   1445 
   1446         // Remove unconditional jump created by splitBasicBlock
   1447         pCurBB->getTerminator()->eraseFromParent();
   1448 
   1449         // Add terminator to end of original block
   1450         IRB()->SetInsertPoint(pCurBB);
   1451 
   1452         // Add conditional branch
   1453         COND_BR(pIsUndef, pPostLoop, pLoop);
   1454 
   1455         // Add loop basic block contents
   1456         IRB()->SetInsertPoint(pLoop);
   1457         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
   1458         PHINode* pMaskPhi = PHI(mInt32Ty, 2);
   1459 
   1460         pIndexPhi->addIncoming(pIndex, pCurBB);
   1461         pMaskPhi->addIncoming(pMask, pCurBB);
   1462 
   1463         // Extract elements for this index
   1464         Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
   1465         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
   1466 
   1467         // GEP to this offset in dst
   1468         Value* pCurDst = GEP(pDst, pOffsetElem);
   1469         pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
   1470         STORE(pSrcElem, pCurDst);
   1471 
   1472         // Update the mask
   1473         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
   1474 
   1475         // Terminator
   1476         Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
   1477 
   1478         pIsUndef = ICMP_EQ(pNewIndex, C(32));
   1479         COND_BR(pIsUndef, pPostLoop, pLoop);
   1480 
   1481         // Update phi edges
   1482         pIndexPhi->addIncoming(pNewIndex, pLoop);
   1483         pMaskPhi->addIncoming(pNewMask, pLoop);
   1484 
   1485         // Move builder to beginning of post loop
   1486         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
   1487     }
   1488 
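             //////////////////////////////////////////////////////////////////////////
             /// @brief per-lane absolute value, implemented by clearing the IEEE-754
             ///        sign bit (bit 31); also clears the sign of -0.0f and NaNs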
   1489     Value* Builder::VABSPS(Value* a)
   1490     {
   1491         Value* asInt = BITCAST(a, mSimdInt32Ty);
   1492         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
   1493         return result;
   1494     }
   1495 
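             //////////////////////////////////////////////////////////////////////////
             /// @brief clamp src to the range [low, high] using signed integer compares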
   1496     Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
   1497     {
   1498         Value *lowCmp = ICMP_SLT(src, low);
   1499         Value *ret = SELECT(lowCmp, low, src);
   1500 
   1501         Value *highCmp = ICMP_SGT(ret, high);
   1502         ret = SELECT(highCmp, high, ret);
   1503 
   1504         return ret;
   1505     }
   1506 
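             //////////////////////////////////////////////////////////////////////////
             /// @brief clamp src to the range [low, high] using ordered float compares;
             ///        NaN lanes pass through unchanged since both compares are false for NaN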
   1507     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
   1508     {
   1509         Value *lowCmp = FCMP_OLT(src, low);
   1510         Value *ret = SELECT(lowCmp, low, src);
   1511 
   1512         Value *highCmp = FCMP_OGT(ret, high);
   1513         ret = SELECT(highCmp, high, ret);
   1514 
   1515         return ret;
   1516     }
   1517 
   1518     Value *Builder::FCLAMP(Value* src, float low, float high)
   1519     {
   1520         Value* result = VMAXPS(src, VIMMED1(low));
   1521         result = VMINPS(result, VIMMED1(high));
   1522 
   1523         return result;
   1524     }
   1525 
   1526     //////////////////////////////////////////////////////////////////////////
    1527     /// @brief save/restore stack, providing the ability to push/pop the stack and
   1528     ///        reduce overall stack requirements for temporary stack use
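             /// Illustrative use from jitted code (someTy is a placeholder type):
             ///     Value* pSaved = STACKSAVE();
             ///     Value* pTmp   = ALLOCA(someTy);   // temporary scratch space
             ///     ...
             ///     STACKRESTORE(pSaved);             // releases pTmp's stack space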
   1529     Value* Builder::STACKSAVE()
   1530     {
   1531         Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
   1532     #if HAVE_LLVM == 0x306
   1533         return CALL(pfnStackSave);
   1534     #else
   1535         return CALLA(pfnStackSave);
   1536     #endif
   1537     }
   1538 
   1539     void Builder::STACKRESTORE(Value* pSaved)
   1540     {
   1541         Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
   1542         CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
   1543     }
   1544 
   1545     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
   1546     {
   1547         Value* vOut;
   1548         // use FMADs if available
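                 // note: the fused path rounds once while the FMUL+FADD fallback rounds
                 // twice, so the two paths can differ in the last ulp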
   1549         if(JM()->mArch.AVX2())
   1550         {
   1551             vOut = VFMADDPS(a, b, c);
   1552         }
   1553         else
   1554         {
   1555             vOut = FADD(FMUL(a, b), c);
   1556         }
   1557         return vOut;
   1558     }
   1559 
   1560     Value* Builder::POPCNT(Value* a)
   1561     {
   1562         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
   1563         return CALL(pCtPop, std::initializer_list<Value*>{a});
   1564     }
   1565 
   1566     //////////////////////////////////////////////////////////////////////////
   1567     /// @brief C functions called by LLVM IR
   1568     //////////////////////////////////////////////////////////////////////////
   1569 
   1570     //////////////////////////////////////////////////////////////////////////
    1571     /// @brief called from JIT code, inserted by PRINT;
    1572     ///        outputs to both stdout and the Visual Studio debug console
   1573     void __cdecl CallPrint(const char* fmt, ...)
   1574     {
   1575         va_list args;
   1576         va_start(args, fmt);
   1577         vprintf(fmt, args);
   1578 
    1579     #if defined( _WIN32 )
    1580         char strBuf[1024];
                 // 'args' was consumed by vprintf above and reusing a consumed va_list is
                 // undefined behavior, so format the debug-console copy from a fresh one
                 va_list argsCopy;
                 va_start(argsCopy, fmt);
    1581         vsnprintf_s(strBuf, _TRUNCATE, fmt, argsCopy);
    1582         OutputDebugString(strBuf);
                 va_end(argsCopy);
    1583     #endif
   1584 
   1585         va_end(args);
   1586     }
   1587 
   1588     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
   1589     {
   1590     #if HAVE_LLVM == 0x306
   1591         Function *func =
   1592             Intrinsic::getDeclaration(JM()->mpCurrentModule,
   1593                                       Intrinsic::x86_avx_vextractf128_si_256);
   1594         return CALL(func, {a, imm8});
   1595     #else
   1596         bool flag = !imm8->isZeroValue();
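                 // build a shuffle that selects the low half of 'a' when imm8 == 0 or the high
                 // half when imm8 != 0, e.g. lanes <0..3> vs <4..7> for an 8-wide SIMD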
   1597         SmallVector<Constant*,8> idx;
   1598         for (unsigned i = 0; i < mVWidth / 2; i++) {
   1599             idx.push_back(C(flag ? i + mVWidth / 2 : i));
   1600         }
   1601         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
   1602     #endif
   1603     }
   1604 
   1605     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
   1606     {
   1607     #if HAVE_LLVM == 0x306
   1608         Function *func =
   1609             Intrinsic::getDeclaration(JM()->mpCurrentModule,
   1610                                       Intrinsic::x86_avx_vinsertf128_si_256);
   1611         return CALL(func, {a, b, imm8});
   1612     #else
   1613         bool flag = !imm8->isZeroValue();
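                 // emulate vinsertf128: first widen 'b' to the full SIMD width (upper lanes
                 // undef), then shuffle it into the low (imm8 == 0) or high (imm8 != 0)
                 // 128-bit half of 'a'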
   1614         SmallVector<Constant*,8> idx;
   1615         for (unsigned i = 0; i < mVWidth; i++) {
   1616             idx.push_back(C(i));
   1617         }
   1618         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
   1619 
   1620         SmallVector<Constant*,8> idx2;
   1621         for (unsigned i = 0; i < mVWidth / 2; i++) {
   1622             idx2.push_back(C(flag ? i : i + mVWidth));
   1623         }
   1624         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
   1625             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
   1626         }
   1627         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
   1628     #endif
   1629     }
   1630 
   1631     // rdtsc buckets macros
   1632     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
   1633     {
   1634         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
    1635         // the buckets framework when single threaded
   1636         if (KNOB_SINGLE_THREADED)
   1637         {
   1638             std::vector<Type*> args{
   1639                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
   1640                 mInt32Ty                        // id
   1641             };
   1642 
   1643             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
   1644             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
   1645             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
   1646             {
   1647                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
   1648             }
   1649 
   1650             CALL(pFunc, { pBucketMgr, pId });
   1651         }
   1652     }
   1653 
   1654     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
   1655     {
   1656         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
    1657         // the buckets framework when single threaded
   1658         if (KNOB_SINGLE_THREADED)
   1659         {
   1660             std::vector<Type*> args{
   1661                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
   1662                 mInt32Ty                        // id
   1663             };
   1664 
   1665             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
   1666             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
   1667             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
   1668             {
   1669                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
   1670             }
   1671 
   1672             CALL(pFunc, { pBucketMgr, pId });
   1673         }
   1674     }
   1675 
   1676 }
   1677