/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
    void __cdecl CallPrint(const char* fmt, ...);

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to a
    ///        16-bit float with 5 exponent bits and 10 mantissa bits.
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
    static uint16_t ConvertFloat32ToFloat16(float val)
    {
        uint32_t sign, exp, mant;
        uint32_t roundBits;

        // Extract the sign, exponent, and mantissa
        uint32_t uf = *(uint32_t*)&val;
        sign = (uf & 0x80000000) >> 31;
        exp = (uf & 0x7F800000) >> 23;
        mant = uf & 0x007FFFFF;

        // Check for out of range
        if (std::isnan(val))
        {
            exp = 0x1F;
            mant = 0x200;
            sign = 1;                     // set the sign bit for NaNs
        }
        else if (std::isinf(val))
        {
            exp = 0x1f;
            mant = 0x0;
        }
        else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
        {
            exp = 0x1E;
            mant = 0x3FF;
        }
        else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
        {
            mant |= 0x00800000;
            for (; exp <= 0x70; mant >>= 1, exp++)
                ;
            exp = 0;
            mant = mant >> 13;
        }
        else if (exp < 0x66) // Too small to represent -> Zero
        {
            exp = 0;
            mant = 0;
        }
        else
        {
            // Save the bits that will be shifted off for rounding
            roundBits = mant & 0x1FFFu;
            // convert exponent and mantissa to 16 bit format
            exp = exp - 0x70;
            mant = mant >> 13;

            // Essentially RTZ, but round up if off by only 1 lsb
            if (roundBits == 0x1FFFu)
            {
                mant++;
                // check for overflow
                if ((mant & 0xC00u) != 0)
                    exp++;
                // make sure only the needed bits are used
                mant &= 0x3FF;
            }
        }

        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
        return (uint16_t)tmpVal;
    }
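
    // Worked example (illustrative note, not part of the original source):
    // 1.0f has the IEEE 754 bit pattern 0x3F800000, so sign = 0,
    // exp = 0x7F (127), and mant = 0.  The exponent is rebiased from 127 to
    // 127 - 0x70 = 0xF and the mantissa is shifted right by 13 bits, giving
    //   (0 << 15) | (0xF << 10) | 0 = 0x3C00,
    // the half-precision encoding of 1.0.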

    //////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
    ///        float
    /// @param val - 16-bit float
    /// @todo Maybe move this outside of this file into a header?
    static float ConvertFloat16ToFloat32(uint32_t val)
    {
        uint32_t result;
        if ((val & 0x7fff) == 0)
        {
            result = ((uint32_t)(val & 0x8000)) << 16;
        }
        else if ((val & 0x7c00) == 0x7c00)
        {
            result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
            result |= ((uint32_t)val & 0x8000) << 16;
        }
        else
        {
            uint32_t sign = (val & 0x8000) << 16;
            uint32_t mant = (val & 0x3ff) << 13;
            uint32_t exp = (val >> 10) & 0x1f;
            if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
            {
                mant <<= 1;
                while (mant < (0x400 << 13))
                {
                    exp--;
                    mant <<= 1;
                }
                mant &= (0x3ff << 13);
            }
            exp = ((exp - 15 + 127) & 0xff) << 23;
            result = sign | exp | mant;
        }

        return *(float*)&result;
    }
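
    // Worked example for the reverse direction (illustrative): the half value
    // 0x3C00 has sign = 0, mant = 0, and exp = 0xF (15).  Rebiasing gives
    //   ((15 - 15 + 127) & 0xff) << 23 = 0x3F800000,
    // so the reconstructed 32-bit pattern is 0x3F800000, i.e. 1.0f, and the
    // two conversions round-trip 1.0f exactly.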

    Constant *Builder::C(bool i)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
    }

    Constant *Builder::C(char i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(uint8_t i)
    {
        return ConstantInt::get(IRB()->getInt8Ty(), i);
    }

    Constant *Builder::C(int i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(int64_t i)
    {
        return ConstantInt::get(IRB()->getInt64Ty(), i);
    }

    Constant *Builder::C(uint16_t i)
    {
        return ConstantInt::get(mInt16Ty, i);
    }

    Constant *Builder::C(uint32_t i)
    {
        return ConstantInt::get(IRB()->getInt32Ty(), i);
    }

    Constant *Builder::C(float i)
    {
        return ConstantFP::get(IRB()->getFloatTy(), i);
    }

    Constant *Builder::PRED(bool pred)
    {
        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
    }

    Value *Builder::VIMMED1(int i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(int i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(uint32_t i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(uint32_t i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1(float i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
    }

    Value *Builder::VIMMED1_16(float i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
    }

    Value *Builder::VIMMED1(bool i)
    {
        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
    }

    Value *Builder::VIMMED1_16(bool i)
    {
        return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
    }
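
    // Usage sketch (illustrative): C() builds a scalar LLVM constant of the
    // matching type, and VIMMED1/VIMMED1_16 splat it across the native or
    // double-pumped SIMD width, e.g.
    //   Value* vOnes  = VIMMED1(1.0f); // <mVWidth x float> splat of 1.0f
    //   Value* vZeros = VIMMED1_16(0); // <mVWidth16 x i32> splat of 0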

    Value *Builder::VUNDEF_IPTR()
    {
        return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth));
    }

    Value *Builder::VUNDEF(Type* t)
    {
        return UndefValue::get(VectorType::get(t, mVWidth));
    }

    Value *Builder::VUNDEF_I()
    {
        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
    }

    Value *Builder::VUNDEF_I_16()
    {
        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
    }

    Value *Builder::VUNDEF_F()
    {
        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
    }

    Value *Builder::VUNDEF_F_16()
    {
        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
    }

    Value *Builder::VUNDEF(Type *ty, uint32_t size)
    {
        return UndefValue::get(VectorType::get(ty, size));
    }

    Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
    {
        // check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }

        return VECTOR_SPLAT(mVWidth, src, name);
    }

    Value *Builder::VBROADCAST_16(Value *src)
    {
        // check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }

        return VECTOR_SPLAT(mVWidth16, src);
    }

    uint32_t Builder::IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getZExtValue();
    }

    int32_t Builder::S_IMMED(Value* v)
    {
        SWR_ASSERT(isa<ConstantInt>(v));
        ConstantInt *pValConst = cast<ConstantInt>(v);
        return pValConst->getSExtValue();
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
    {
        std::vector<Value*> args;
        for (auto arg : argsList)
            args.push_back(arg);
        return CALLA(Callee, args, name);
    }

    CallInst *Builder::CALL(Value *Callee, Value* arg)
    {
        std::vector<Value*> args;
        args.push_back(arg);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        return CALLA(Callee, args);
    }

    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
    {
        std::vector<Value*> args;
        args.push_back(arg1);
        args.push_back(arg2);
        args.push_back(arg3);
        return CALLA(Callee, args);
    }

    //////////////////////////////////////////////////////////////////////////
    Value *Builder::DEBUGTRAP()
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
        return CALL(func);
    }

    Value *Builder::VRCP(Value *va, const llvm::Twine& name)
    {
        return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
    }

    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
    {
        Value* vOut = FMADDPS(vA, vX, vC);
        vOut = FMADDPS(vB, vY, vOut);
        return vOut;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether each lane loads from
    ///        memory or produces 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use the avx2 masked load instruction if available
        if (JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, {src, mask});
        }
        else
        {
            // the maskload intrinsic expects an integer mask operand in llvm >= 3.8
    #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
    #else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
    #endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, {src, mask}), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }
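
    // Behavioral sketch (illustrative): given an 8-wide i32 mask whose lanes
    // are {-1, 0, -1, 0, -1, 0, -1, 0}, MASKLOADD(pSrc, mask) loads elements
    // 0, 2, 4, and 6 from pSrc and yields 0 in the remaining lanes -- the
    // lanes with the sign bit set load, matching the hardware maskload
    // semantics on both the AVX2 and the emulated float paths.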

    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
    ///   result from a GEP, printing out the pointer to memory
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to stdout
    CallInst *Builder::PRINT(const std::string &printStr, const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.  we still need to modify it for vectors
        printCallArgs.resize(1);

        // search through the format string for special processing
        size_t pos = 0;
        std::string tempStr(printStr);
        pos = tempStr.find('%', pos);
        auto v = printArgs.begin();

        while ((pos != std::string::npos) && (v != printArgs.end()))
        {
            Value* pArg = *v;
            Type* pType = pArg->getType();

            if (pType->isVectorTy())
            {
                Type* pContainedType = pType->getContainedType(0);

                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr[pos + 1] = 'x';
                    tempStr.insert(pos + 2, "%08X ");
                    pos += 7;

                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                    std::string vectorFormatStr;
                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                    {
                        vectorFormatStr += "0x%08X ";
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }

                    tempStr.insert(pos, vectorFormatStr);
                    pos += vectorFormatStr.size();
                }
                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%f "));
                        pos += 3;
                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                    }
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                {
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%d "));
                        pos += 3;
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
            }
            else
            {
                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    tempStr[pos] = '0';
                    tempStr.insert(pos + 1, "x%08");
                    printCallArgs.push_back(pArg);
                    pos += 3;
                }
                // for %f we need to cast float Values to doubles so that they print out correctly
                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
                {
                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                    pos++;
                }
                else
                {
                    printCallArgs.push_back(pArg);
                }
            }

            // advance to the next argument
            v++;
            pos = tempStr.find('%', ++pos);
        }

        // create global variable constant string
        Constant *constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

        // get a pointer to the first character in the constant string array
        std::vector<Constant*> geplist{C(0), C(0)};
        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);

        // insert the pointer to the format string in the argument vector
        printCallArgs[0] = strGEP;

        // get pointer to CallPrint function and insert decl into the module if needed
        std::vector<Type*> args;
        args.push_back(PointerType::get(mInt8Ty, 0));
        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

        // if we haven't yet added the symbol to the symbol table
        if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
        }

        // insert a call to CallPrint
        return CALLA(callPrintFn, printCallArgs);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Wrapper around PRINT with initializer list.
    CallInst* Builder::PRINT(const std::string &printStr)
    {
        return PRINT(printStr, {});
    }
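
    // Usage sketch (illustrative): scalar and vector Values may be mixed in a
    // single format string; vector arguments are expanded lane by lane by the
    // loop above, e.g.
    //   PRINT("x: %f flags: %x\n", {vX, C(0xFF)});
    // where vX is a SIMD float Value whose lanes are each printed with %f.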

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }
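
    // Call sketch (illustrative): gather mVWidth floats from a byte-addressed
    // buffer, keeping the old value in masked-off lanes:
    //   Value* vNew = GATHERPS(vOld, pBufferBase, vByteOffsets, vLaneMask);
    // On AVX2 this maps to a single vgatherps; otherwise it expands to the
    // scalar load loop above.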

    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if we're trying to access a vertex
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

    Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
    {
        if (imm == 0)
        {
            return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
        }
        else
        {
            return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
        }
    }

    Value *Builder::JOIN_16(Value *a, Value *b)
    {
        return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
    }
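
    // Shuffle sketch (illustrative): EXTRACT_16(v, 0) and EXTRACT_16(v, 1)
    // split a 16-wide vector into its low and high 8-wide halves, and
    // JOIN_16 is the inverse, so
    //   JOIN_16(EXTRACT_16(v, 0), EXTRACT_16(v, 1))
    // reproduces v.  The double-pumped *_16 gathers above rely on this
    // split/join identity.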

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
    Value *Builder::MASK(Value *vmask)
    {
        Value *src = BITCAST(vmask, mSimdInt32Ty);
        return ICMP_SLT(src, VIMMED1(0));
    }

    Value *Builder::MASK_16(Value *vmask)
    {
        Value *src = BITCAST(vmask, mSimd16Int32Ty);
        return ICMP_SLT(src, VIMMED1_16(0));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
    Value *Builder::VMASK(Value *mask)
    {
        return S_EXT(mask, mSimdInt32Ty);
    }

    Value *Builder::VMASK_16(Value *mask)
    {
        return S_EXT(mask, mSimd16Int32Ty);
    }
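
    // Round-trip sketch (illustrative): MASK compares each 32-bit lane
    // against 0, so only the sign bit matters, and VMASK sign-extends an
    // <N x i1> mask to 0/-1 per lane; hence for any <N x i1> value vCond,
    //   MASK(BITCAST(VMASK(vCond), mSimdFP32Ty)) == vCond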

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it
    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
    /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
    /// lower 128 bits of a, and vice versa for the upper lanes.  If a mask
    /// value is negative, '0' is inserted.
    Value *Builder::PSHUFB(Value* a, Value* b)
    {
        Value* res;
        // use avx2 pshufb instruction if available
        if (JM()->mArch.AVX2())
        {
            res = VPSHUFB(a, b);
        }
        else
        {
            Constant* cB = dyn_cast<Constant>(b);
            // number of 8 bit elements in b
            uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
            // output vector
            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));

            // insert an 8 bit value from the high and low lanes of a per loop iteration
            numElms /= 2;
            for (uint32_t i = 0; i < numElms; i++)
            {
                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
                ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));

                // extract values from constant mask
                char valLow128bLane =  (char)(cLow128b->getSExtValue());
                char valHigh128bLane = (char)(cHigh128b->getSExtValue());

                Value* insertValLow128b;
                Value* insertValHigh128b;

                // if the mask value is negative, insert a '0' in the respective output position
                // otherwise, look up the value at the mask position (bits 3..0 of the respective mask byte) in a and insert it in the output vector
                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));

                vShuf = VINSERT(vShuf, insertValLow128b, i);
                vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
            }
            res = vShuf;
        }
        return res;
    }
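
    // Worked example (illustrative): a mask byte of 0x80 has its sign bit
    // set, so the corresponding output byte is 0; a mask byte of 0x03 in the
    // lower lane selects a[3], while the same mask byte in the upper lane
    // selects from the upper 16 bytes of a, matching vpshufb's
    // per-128-bit-lane behavior.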

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only the
    /// lower 8 values are used.
    Value *Builder::PMOVSXBD(Value* a)
    {
        // VPMOVSXBD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from the 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
    Value *Builder::PMOVSXWD(Value* a)
    {
        // VPMOVSXWD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from the 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMD(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if (JM()->mArch.AVX2())
        {
            res = VPERMD(a, idx);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_I();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }
        return res;
    }
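
    // Usage sketch (illustrative): with a constant index vector PERMD lowers
    // to a plain LLVM shufflevector, e.g. a full cross-lane reversal:
    //   Value* vRev = PERMD(vSrc, C<int32_t>({7, 6, 5, 4, 3, 2, 1, 0}));
    // A non-constant index falls back to the extract/insert loop above.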

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
    /// platform, emulate it
    /// @param a - 256bit SIMD lane(8x32bit) of float values.
    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
    Value *Builder::PERMPS(Value* a, Value* idx)
    {
        Value* res;
        // use avx2 permute instruction if available
        if (JM()->mArch.AVX2())
        {
            // llvm 3.6.0 swapped the order of the args to vpermd
            res = VPERMPS(idx, a);
        }
        else
        {
            if (isa<Constant>(idx))
            {
                res = VSHUFFLE(a, a, idx);
            }
            else
            {
                res = VUNDEF_F();
                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
                {
                    Value* pIndex = VEXTRACT(idx, C(l));
                    Value* pVal = VEXTRACT(a, pIndex);
                    res = VINSERT(res, pVal, C(l));
                }
            }
        }

        return res;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
    Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPH2PS(a, name);
        }
        else
        {
            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
            }

            Value* pResult = UndefValue::get(mSimdFP32Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            pResult->setName(name);
            return pResult;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
    /// in LLVM IR.  If not supported on the underlying platform, emulate it
    /// @param a - SIMD wide vector of float32 values to convert.
    /// @param rounding - rounding mode control value
    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
    {
        if (JM()->mArch.F16C())
        {
            return VCVTPS2PH(a, rounding);
        }
        else
        {
            // call scalar C function for now
            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));

            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
            }

            Value* pResult = UndefValue::get(mSimdInt16Ty);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                Value* pSrc = VEXTRACT(a, C(i));
                Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
                pResult = VINSERT(pResult, pConv, C(i));
            }

            return pResult;
        }
    }

    Value *Builder::PMAXSD(Value* a, Value* b)
    {
        Value* cmp = ICMP_SGT(a, b);
        return SELECT(cmp, a, b);
    }

    Value *Builder::PMINSD(Value* a, Value* b)
    {
        Value* cmp = ICMP_SLT(a, b);
        return SELECT(cmp, a, b);
    }
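
    // Usage sketch (illustrative): the compare-plus-select form composes into
    // a signed integer clamp, which the backend can pattern-match back to
    // pmaxsd/pminsd where available:
    //   Value* vClamped = PMINSD(PMAXSD(v, vLo), vHi); // clamp v to [vLo, vHi]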

    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                          Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
            case 16:
            {
                Value* vGatherResult[2];

                // TODO: vGatherMaskedVal
                Value* vGatherMaskedVal = VIMMED1((float)0);

                // always have at least one component out of x or y to fetch

                vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of first 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                //

                // if we have at least one component out of z or w to fetch
                if (info.numComps > 2)
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));

                    vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }
                else
                {
                    vGatherResult[1] = vGatherMaskedVal;
                }

                // Shuffle gathered components into place, each row is a component
                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
            break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
                }

                for (uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
            break;
            default:
                SWR_INVALID("Invalid float format");
                break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
            case 8:
            {
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);
                Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of an 8x32bit integer gather for 8bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
            }
            break;
            case 16:
            {
                Value* vGatherResult[2];

                // TODO: vGatherMaskedVal
                Value* vGatherMaskedVal = VIMMED1((int32_t)0);

                // always have at least one component out of x or y to fetch

                vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of first 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                //

                // if we have at least one component out of z or w to fetch
                if (info.numComps > 2)
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));

                    vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }
                else
                {
                    vGatherResult[1] = vGatherMaskedVal;
                }

                // Shuffle gathered components into place, each row is a component
                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

            }
            break;
            case 32:
            {
                // apply defaults
                for (uint32_t i = 0; i < 4; ++i)
                {
                    vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
                }

                for (uint32_t i = 0; i < info.numComps; i++)
                {
                    uint32_t swizzleIndex = info.swizzle[i];

                    // Gather a SIMD of components
                    vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                    // offset base to the next component to gather
                    pSrcBase = GEP(pSrcBase, C((char)4));
                }
            }
            break;
            default:
                SWR_INVALID("unsupported format");
                break;
        }
    }
   1226 
   1227     void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
   1228     {
   1229         // cast types
   1230         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
   1231         Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   1232 
   1233         // input could either be float or int vector; do shuffle work in int
   1234         vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
   1235         vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
   1236 
   1237         if(bPackedOutput)
   1238         {
   1239             Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1240 
   1241             // shuffle mask
   1242             Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
   1243                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
   1244             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
   1245             // after pshufb: group components together in each 128bit lane
   1246             // 256i - 0    1    2    3    4    5    6    7
   1247             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
   1248 
   1249             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
   1250             // after PERMD: move and pack xy components into each 128bit lane
   1251             // 256i - 0    1    2    3    4    5    6    7
   1252             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
   1253 
   1254             // do the same for zw components
   1255             Value* vi128ZW = nullptr;
   1256             if(info.numComps > 2)
   1257             {
   1258                 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
   1259                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
   1260             }
   1261 
   1262             for(uint32_t i = 0; i < 4; i++)
   1263             {
   1264                 uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
   1266                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
   1267                 if(i >= info.numComps)
   1268                 {
   1269                     // set the default component val
   1270                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
   1271                     continue;
   1272                 }
   1273 
   1274                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1275                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1276                 // if x or y, use vi128XY permute result, else use vi128ZW
   1277                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1278 
   1279                 // extract packed component 128 bit lanes
   1280                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
   1281             }
   1282 
   1283         }
   1284         else
   1285         {
   1286             // pshufb masks for each component
   1287             Value* vConstMask[2];
   1288             // x/z shuffle mask
   1289             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
   1290                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
   1291 
   1292             // y/w shuffle mask
   1293             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
   1294                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
   1295 
   1296 
   1297             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
   1298             // apply defaults
   1299             for (uint32_t i = 0; i < 4; ++i)
   1300             {
   1301                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
   1302             }
   1303 
   1304             for(uint32_t i = 0; i < info.numComps; i++)
   1305             {
   1306                 uint32_t swizzleIndex = info.swizzle[i];
   1307 
   1308                 // select correct constMask for x/z or y/w pshufb
   1309                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second
   1311                 uint32_t selectedGather = (i < 2) ? 0 : 1;
   1312 
   1313                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
   1314                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
   1315                 // 256i - 0    1    2    3    4    5    6    7
   1316                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
   1317             }
   1318         }
   1319     }
   1320 
   1321     void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
   1322     {
   1323         // cast types
   1324         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   1326 
   1327         if(bPackedOutput)
   1328         {
   1329             Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1330             // shuffle mask
   1331             Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
   1332                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
   1333             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
   1334             // after pshufb: group components together in each 128bit lane
   1335             // 256i - 0    1    2    3    4    5    6    7
   1336             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
   1337 
   1338             Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: x components packed into the low 64 bits of lane 0, y components into the low 64 bits of lane 1
   1340             // 256i - 0    1    2    3    4    5    6    7
   1341             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
   1342 
   1343             // do the same for zw components
   1344             Value* vi128ZW = nullptr;
   1345             if(info.numComps > 2)
   1346             {
   1347                 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
   1348             }
   1349 
            // for each enabled component, extract its packed 128-bit lane; apply defaults to the rest
   1351             for(uint32_t i = 0; i < 4; i++)
   1352             {
   1353                 uint32_t swizzleIndex = info.swizzle[i];
   1354                 // todo: fix for packed
   1355                 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
   1356                 if(i >= info.numComps)
   1357                 {
   1358                     // set the default component val
   1359                     vGatherOutput[swizzleIndex] = vGatherMaskedVal;
   1360                     continue;
   1361                 }
   1362 
   1363                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1364                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1365                 // if x or y, use vi128XY permute result, else use vi128ZW
   1366                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1367 
                // extract packed component 128 bit lanes
   1369                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
   1370             }
   1371         }
        else // zero extend
        {
   1374             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
   1375             // apply defaults
   1376             for (uint32_t i = 0; i < 4; ++i)
   1377             {
   1378                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
   1379             }
   1380 
            for(uint32_t i = 0; i < info.numComps; i++)
            {
   1382                 uint32_t swizzleIndex = info.swizzle[i];
   1383 
   1384                 // pshufb masks for each component
   1385                 Value* vConstMask;
   1386                 switch(i)
   1387                 {
   1388                     case 0:
   1389                         // x shuffle mask
   1390                         vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
   1391                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
   1392                         break;
   1393                     case 1:
   1394                         // y shuffle mask
   1395                         vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
   1396                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
   1397                         break;
   1398                     case 2:
   1399                         // z shuffle mask
   1400                         vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
   1401                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
   1402                         break;
   1403                     case 3:
   1404                         // w shuffle mask
   1405                         vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
   1406                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
   1407                         break;
                    default:
                        // unreachable: i is bounded above by info.numComps (at most 4)
                        vConstMask = nullptr;
                        break;
   1411                 }
   1412 
                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
   1417             }
   1418         }
   1419     }
   1420 
    //////////////////////////////////////////////////////////////////////////
    /// @brief Helper function to create an alloca in the entry block of a function
   1422     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
   1423     {
   1424         auto saveIP = IRB()->saveIP();
   1425         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
   1426                               pFunc->getEntryBlock().begin());
   1427         Value* pAlloca = ALLOCA(pType);
   1428         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
   1429         return pAlloca;
   1430     }
   1431 
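    //////////////////////////////////////////////////////////////////////////
    /// @brief Helper function to create an array alloca in the entry block
    ///        of a function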
   1432     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
   1433     {
   1434         auto saveIP = IRB()->saveIP();
   1435         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
   1436             pFunc->getEntryBlock().begin());
   1437         Value* pAlloca = ALLOCA(pType, pArraySize);
   1438         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
   1439         return pAlloca;
   1440     }
   1441 
   1442     //////////////////////////////////////////////////////////////////////////
   1443     /// @brief emulates a scatter operation.
   1444     /// @param pDst - pointer to destination
   1445     /// @param vSrc - vector of src data to scatter
   1446     /// @param vOffsets - vector of byte offsets from pDst
   1447     /// @param vMask - mask of valid lanes
   1448     void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
   1449     {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               mask &= ~(1 << Index)
        */
   1459 
   1460         BasicBlock* pCurBB = IRB()->GetInsertBlock();
   1461         Function* pFunc = pCurBB->getParent();
   1462         Type* pSrcTy = vSrc->getType()->getVectorElementType();
   1463 
   1464         // Store vectors on stack
   1465         if (pScatterStackSrc == nullptr)
   1466         {
   1467             // Save off stack allocations and reuse per scatter. Significantly reduces stack
   1468             // requirements for shaders with a lot of scatters.
   1469             pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
   1470             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
   1471         }
   1472 
   1473         Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
   1474         Value* pOffsetsArrayPtr = pScatterStackOffsets;
   1475         STORE(vSrc, pSrcArrayPtr);
   1476         STORE(vOffsets, pOffsetsArrayPtr);
   1477 
   1478         // Cast to pointers for random access
   1479         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
   1480         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
   1481 
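        // compress the per-lane vector mask into a scalar bitmask (one sign bit per lane)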
   1482         Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
   1483 
   1484         // Get cttz function
   1485         Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
   1486 
   1487         // Setup loop basic block
   1488         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
   1489 
   1490         // compute first set bit
   1491         Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
   1492 
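        // cttz of a zero mask returns the bit width (32), signalling no lanes remain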
   1493         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
   1494 
   1495         // Split current block
   1496         BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
   1497 
   1498         // Remove unconditional jump created by splitBasicBlock
   1499         pCurBB->getTerminator()->eraseFromParent();
   1500 
   1501         // Add terminator to end of original block
   1502         IRB()->SetInsertPoint(pCurBB);
   1503 
   1504         // Add conditional branch
   1505         COND_BR(pIsUndef, pPostLoop, pLoop);
   1506 
   1507         // Add loop basic block contents
   1508         IRB()->SetInsertPoint(pLoop);
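        // phi nodes carry the loop state: the current lane index and the remaining mask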
   1509         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
   1510         PHINode* pMaskPhi = PHI(mInt32Ty, 2);
   1511 
   1512         pIndexPhi->addIncoming(pIndex, pCurBB);
   1513         pMaskPhi->addIncoming(pMask, pCurBB);
   1514 
   1515         // Extract elements for this index
   1516         Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
   1517         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
   1518 
   1519         // GEP to this offset in dst
   1520         Value* pCurDst = GEP(pDst, pOffsetElem);
   1521         pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
   1522         STORE(pSrcElem, pCurDst);
   1523 
   1524         // Update the mask
   1525         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
   1526 
   1527         // Terminator
   1528         Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
   1529 
   1530         pIsUndef = ICMP_EQ(pNewIndex, C(32));
   1531         COND_BR(pIsUndef, pPostLoop, pLoop);
   1532 
   1533         // Update phi edges
   1534         pIndexPhi->addIncoming(pNewIndex, pLoop);
   1535         pMaskPhi->addIncoming(pNewMask, pLoop);
   1536 
   1537         // Move builder to beginning of post loop
   1538         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
   1539     }
   1540 
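    //////////////////////////////////////////////////////////////////////////
    /// @brief Computes the absolute value of packed floats by masking off the
    ///        sign bit of each lane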
   1541     Value* Builder::VABSPS(Value* a)
   1542     {
   1543         Value* asInt = BITCAST(a, mSimdInt32Ty);
   1544         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
   1545         return result;
   1546     }
   1547 
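    //////////////////////////////////////////////////////////////////////////
    /// @brief Clamps src to [low, high] using signed integer comparisons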
   1548     Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
   1549     {
   1550         Value *lowCmp = ICMP_SLT(src, low);
   1551         Value *ret = SELECT(lowCmp, low, src);
   1552 
   1553         Value *highCmp = ICMP_SGT(ret, high);
   1554         ret = SELECT(highCmp, high, ret, name);
   1555 
   1556         return ret;
   1557     }
   1558 
   1559     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
   1560     {
   1561         Value *lowCmp = FCMP_OLT(src, low);
   1562         Value *ret = SELECT(lowCmp, low, src);
   1563 
   1564         Value *highCmp = FCMP_OGT(ret, high);
   1565         ret = SELECT(highCmp, high, ret);
   1566 
   1567         return ret;
   1568     }
   1569 
   1570     Value *Builder::FCLAMP(Value* src, float low, float high)
   1571     {
   1572         Value* result = VMAXPS(src, VIMMED1(low));
   1573         result = VMINPS(result, VIMMED1(high));
   1574 
   1575         return result;
   1576     }
   1577 
   1578     //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore the stack pointer, providing the ability to push/pop
    ///        temporary stack allocations and reduce overall stack requirements
   1581     Value* Builder::STACKSAVE()
   1582     {
   1583         Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
   1584         return CALLA(pfnStackSave);
   1585     }
   1586 
   1587     void Builder::STACKRESTORE(Value* pSaved)
   1588     {
   1589         Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
   1590         CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
   1591     }
   1592 
   1593     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
   1594     {
   1595         Value* vOut;
   1596         // use FMADs if available
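        // (a fused multiply-add rounds once; the FMUL+FADD fallback rounds twice,
        // so results can differ in the last ulp)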
   1597         if(JM()->mArch.AVX2())
   1598         {
   1599             vOut = VFMADDPS(a, b, c);
   1600         }
   1601         else
   1602         {
   1603             vOut = FADD(FMUL(a, b), c);
   1604         }
   1605         return vOut;
   1606     }
   1607 
   1608     Value* Builder::POPCNT(Value* a)
   1609     {
   1610         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
   1611         return CALL(pCtPop, std::initializer_list<Value*>{a});
   1612     }
   1613 
   1614     //////////////////////////////////////////////////////////////////////////
   1615     /// @brief C functions called by LLVM IR
   1616     //////////////////////////////////////////////////////////////////////////
   1617 
   1618     //////////////////////////////////////////////////////////////////////////
   1619     /// @brief called in JIT code, inserted by PRINT
   1620     /// output to both stdout and visual studio debug console
   1621     void __cdecl CallPrint(const char* fmt, ...)
   1622     {
        va_list args;
        va_start(args, fmt);

    #if defined( _WIN32 )
        // vprintf consumes args below, so format the debug-console output from a
        // separate copy of the argument list
        va_list argsCopy;
        va_copy(argsCopy, args);
        char strBuf[1024];
        vsnprintf_s(strBuf, _TRUNCATE, fmt, argsCopy);
        OutputDebugStringA(strBuf);
        va_end(argsCopy);
    #endif

        vprintf(fmt, args);
        va_end(args);
   1634     }
   1635 
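    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates vextracti128 with a shuffle: extracts the low
    ///        (imm8 == 0) or high (imm8 != 0) 128-bit half of a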
   1636     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
   1637     {
   1638         bool flag = !imm8->isZeroValue();
   1639         SmallVector<Constant*,8> idx;
   1640         for (unsigned i = 0; i < mVWidth / 2; i++) {
   1641             idx.push_back(C(flag ? i + mVWidth / 2 : i));
   1642         }
   1643         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
   1644     }
   1645 
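    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates vinserti128 with shuffles: inserts b into the low
    ///        (imm8 == 0) or high (imm8 != 0) 128-bit half of a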
   1646     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
   1647     {
   1648         bool flag = !imm8->isZeroValue();
   1649         SmallVector<Constant*,8> idx;
   1650         for (unsigned i = 0; i < mVWidth; i++) {
   1651             idx.push_back(C(i));
   1652         }
   1653         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
   1654 
   1655         SmallVector<Constant*,8> idx2;
   1656         for (unsigned i = 0; i < mVWidth / 2; i++) {
   1657             idx2.push_back(C(flag ? i : i + mVWidth));
   1658         }
   1659         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
   1660             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
   1661         }
   1662         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
   1663     }
   1664 
   1665     // rdtsc buckets macros
   1666     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
   1667     {
   1668         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
   1669         // buckets framework when single threaded
   1670         if (KNOB_SINGLE_THREADED)
   1671         {
   1672             std::vector<Type*> args{
   1673                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
   1674                 mInt32Ty                        // id
   1675             };
   1676 
   1677             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
   1678             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
   1679             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
   1680             {
   1681                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
   1682             }
   1683 
   1684             CALL(pFunc, { pBucketMgr, pId });
   1685         }
   1686     }
   1687 
   1688     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
   1689     {
   1690         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
   1691         // buckets framework when single threaded
   1692         if (KNOB_SINGLE_THREADED)
   1693         {
   1694             std::vector<Type*> args{
   1695                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
   1696                 mInt32Ty                        // id
   1697             };
   1698 
   1699             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
   1700             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
   1701             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
   1702             {
   1703                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
   1704             }
   1705 
   1706             CALL(pFunc, { pBucketMgr, pId });
   1707         }
   1708     }
   1709 
   1710 
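    //////////////////////////////////////////////////////////////////////////
    /// @brief Returns the size of a type in bytes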
   1711     uint32_t Builder::GetTypeSize(Type* pType)
   1712     {
   1713         if (pType->isStructTy())
   1714         {
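            // note: assumes a homogeneous struct (all elements share the type of
            // element 0) and ignores any padding between elements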
   1715             uint32_t numElems = pType->getStructNumElements();
   1716             Type* pElemTy = pType->getStructElementType(0);
   1717             return numElems * GetTypeSize(pElemTy);
   1718         }
   1719 
   1720         if (pType->isArrayTy())
   1721         {
   1722             uint32_t numElems = pType->getArrayNumElements();
   1723             Type* pElemTy = pType->getArrayElementType();
   1724             return numElems * GetTypeSize(pElemTy);
   1725         }
   1726 
   1727         if (pType->isIntegerTy())
   1728         {
   1729             uint32_t bitSize = pType->getIntegerBitWidth();
   1730             return bitSize / 8;
   1731         }
   1732 
   1733         if (pType->isFloatTy())
   1734         {
   1735             return 4;
   1736         }
   1737 
   1738         if (pType->isHalfTy())
   1739         {
   1740             return 2;
   1741         }
   1742 
   1743         if (pType->isDoubleTy())
   1744         {
   1745             return 8;
   1746         }
   1747 
   1748         SWR_ASSERT(false, "Unimplemented type.");
   1749         return 0;
   1750     }
   1751 }
   1752