Home | History | Annotate | Download | only in jitter
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file fetch_jit.cpp
     24 *
     25 * @brief Implementation of the fetch jitter
     26 *
     27 * Notes:
     28 *
     29 ******************************************************************************/
     30 #include "jit_pch.hpp"
     31 #include "builder.h"
     32 #include "jit_api.h"
     33 #include "fetch_jit.h"
     34 #include "gen_state_llvm.h"
     35 
     36 //#define FETCH_DUMP_VERTEX 1
     37 using namespace llvm;
     38 using namespace SwrJit;
     39 
     40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
     41 
     42 enum ConversionType
     43 {
     44     CONVERT_NONE,
     45     CONVERT_NORMALIZED,
     46     CONVERT_USCALED,
     47     CONVERT_SSCALED,
     48     CONVERT_SFIXED,
     49 };
     50 
     51 #if USE_SIMD16_SHADERS
     52 #define USE_SIMD16_GATHERS 0
     53 #endif
     54 
     55 //////////////////////////////////////////////////////////////////////////
     56 /// Interface to Jitting a fetch shader
     57 //////////////////////////////////////////////////////////////////////////
     58 struct FetchJit : public Builder
     59 {
     60     FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
     61 
     62     Function* Create(const FETCH_COMPILE_STATE& fetchState);
     63 
     64     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
     65     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
     66     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
     67 
     68     // package up Shuffle*bpcGatherd args into a tuple for convenience
     69     typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
     70         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
     71         const uint32_t(&)[4]> Shuffle8bpcArgs;
     72 
     73 #if USE_SIMD16_SHADERS
     74 #if USE_SIMD16_GATHERS
     75     void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
     76 #else
     77     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
     78 #endif
     79 #else
     80     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
     81 #endif
     82 
     83     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
     84         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
     85 
     86 #if USE_SIMD16_SHADERS
     87 #if USE_SIMD16_GATHERS
     88     void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
     89 #else
     90     void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
     91 #endif
     92 #else
     93     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
     94 #endif
     95 
     96 #if USE_SIMD16_GATHERS
     97     void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
     98 #else
     99     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
    100 #endif
    101 
    102 #if USE_SIMD16_SHADERS
    103 #if USE_SIMD16_GATHERS
    104     Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
    105 #else
    106     Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
    107 #endif
    108 #else
    109     Value *GenerateCompCtrlVector(const ComponentControl ctrl);
    110 #endif
    111 
    112     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
    113 
    114 #if USE_SIMD16_SHADERS
    115 #if USE_SIMD16_GATHERS
    116     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
    117 #else
    118     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
    119 #endif
    120 #else
    121     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
    122 #endif
    123 
    124     bool IsOddFormat(SWR_FORMAT format);
    125     bool IsUniformFormat(SWR_FORMAT format);
    126     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    127     void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    128     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
    129 
    130     Value* mpPrivateContext;
    131     Value* mpFetchInfo;
    132 };
    133 
    134 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
    135 {
    136     std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    137     fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
    138 
    139     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    140     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
    141 
    142     fetch->getParent()->setModuleIdentifier(fetch->getName());
    143 
    144     IRB()->SetInsertPoint(entry);
    145 
    146     auto    argitr = fetch->arg_begin();
    147 
    148     // Fetch shader arguments
    149     mpPrivateContext = &*argitr; ++argitr;
    150     mpPrivateContext->setName("privateContext");
    151 
    152     mpFetchInfo = &*argitr; ++argitr;
    153     mpFetchInfo->setName("fetchInfo");
    154     Value*    pVtxOut = &*argitr;
    155     pVtxOut->setName("vtxOutput");
    156     // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
    157     // index 0(just the pointer to the simdvertex structure
    158     // index 1(which element of the simdvertex structure to offset to(in this case 0)
    159     // so the indices being i32's doesn't matter
    160     // TODO: generated this GEP with a VECTOR structure type so this makes sense
    161     std::vector<Value*>    vtxInputIndices(2, C(0));
    162     // GEP
    163     pVtxOut = GEP(pVtxOut, C(0));
    164 #if USE_SIMD16_SHADERS
    165 #if 0// USE_SIMD16_BUILDER
    166     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
    167 #else
    168     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
    169 #endif
    170 #else
    171     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
    172 #endif
    173 
    174     // SWR_FETCH_CONTEXT::pStreams
    175     Value*    streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
    176     streams->setName("pStreams");
    177 
    178     // SWR_FETCH_CONTEXT::pIndices
    179     Value*    indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
    180     indices->setName("pIndices");
    181 
    182     // SWR_FETCH_CONTEXT::pLastIndex
    183     Value*    pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
    184     pLastIndex->setName("pLastIndex");
    185 
    186 
    187     Value* vIndices;
    188 #if USE_SIMD16_SHADERS
    189     Value* indices2;
    190     Value* vIndices2;
    191 #endif
    192     switch(fetchState.indexType)
    193     {
    194         case R8_UINT:
    195             indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
    196 #if USE_SIMD16_SHADERS
    197             indices2 = GEP(indices, C(8));
    198 #endif
    199             if(fetchState.bDisableIndexOOBCheck)
    200             {
    201                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
    202                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
    203 #if USE_SIMD16_SHADERS
    204                 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
    205                 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
    206 #endif
    207             }
    208             else
    209             {
    210                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
    211                 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
    212 #if USE_SIMD16_SHADERS
    213                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
    214                 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
    215 #endif
    216             }
    217             break;
    218         case R16_UINT:
    219             indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
    220 #if USE_SIMD16_SHADERS
    221             indices2 = GEP(indices, C(8));
    222 #endif
    223             if(fetchState.bDisableIndexOOBCheck)
    224             {
    225                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
    226                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
    227 #if USE_SIMD16_SHADERS
    228                 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
    229                 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
    230 #endif
    231             }
    232             else
    233             {
    234                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
    235                 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
    236 #if USE_SIMD16_SHADERS
    237                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
    238                 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
    239 #endif
    240             }
    241             break;
    242         case R32_UINT:
    243 #if USE_SIMD16_SHADERS
    244             indices2 = GEP(indices, C(8));
    245 #endif
    246             (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
    247                                                : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
    248 #if USE_SIMD16_SHADERS
    249             (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
    250                                                : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
    251 #endif
    252             break; // incoming type is already 32bit int
    253         default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
    254     }
    255 
    256     if(fetchState.bForceSequentialAccessEnable)
    257     {
    258         Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
    259 
    260         // VertexData buffers are accessed sequentially, the index is equal to the vertex number
    261         vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
    262         vIndices = ADD(vIndices, pOffsets);
    263 #if USE_SIMD16_SHADERS
    264         vIndices2 = ADD(vIndices, VIMMED1(8));
    265 #endif
    266     }
    267 
    268     Value* vVertexId = vIndices;
    269 #if USE_SIMD16_SHADERS
    270     Value* vVertexId2 = vIndices2;
    271 #endif
    272     if (fetchState.bVertexIDOffsetEnable)
    273     {
    274         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
    275         Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
    276         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
    277         vVertexId = ADD(vIndices, vBaseVertex);
    278         vVertexId = ADD(vVertexId, vStartVertex);
    279 #if USE_SIMD16_SHADERS
    280         vVertexId2 = ADD(vIndices2, vBaseVertex);
    281         vVertexId2 = ADD(vVertexId2, vStartVertex);
    282 #endif
    283     }
    284 
    285     // store out vertex IDs
    286     STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
    287 #if USE_SIMD16_SHADERS
    288     STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
    289 #endif
    290 
    291     // store out cut mask if enabled
    292     if (fetchState.bEnableCutIndex)
    293     {
    294         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
    295         Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
    296         STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
    297 #if USE_SIMD16_SHADERS
    298         Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
    299         STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
    300 #endif
    301     }
    302 
    303     // Fetch attributes from memory and output to a simdvertex struct
    304     // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
    305 #if USE_SIMD16_SHADERS
    306     if (fetchState.bDisableVGATHER)
    307     {
    308         JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
    309         JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
    310     }
    311     else
    312     {
    313 #if USE_SIMD16_GATHERS
    314         JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
    315 #else
    316         JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
    317         JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
    318 #endif
    319     }
    320 #else
    321     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
    322                                  : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
    323 #endif
    324 
    325     RET_VOID();
    326 
    327     JitManager::DumpToFile(fetch, "src");
    328 
    329 #if defined(_DEBUG)
    330     verifyFunction(*fetch);
    331 #endif
    332 
    333     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
    334 
    335     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    336     setupPasses.add(createBreakCriticalEdgesPass());
    337     setupPasses.add(createCFGSimplificationPass());
    338     setupPasses.add(createEarlyCSEPass());
    339     setupPasses.add(createPromoteMemoryToRegisterPass());
    340 
    341     setupPasses.run(*fetch);
    342 
    343     JitManager::DumpToFile(fetch, "se");
    344 
    345     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
    346 
    347     ///@todo Haven't touched these either. Need to remove some of these and add others.
    348     optPasses.add(createCFGSimplificationPass());
    349     optPasses.add(createEarlyCSEPass());
    350     optPasses.add(createInstructionCombiningPass());
    351     optPasses.add(createInstructionSimplifierPass());
    352     optPasses.add(createConstantPropagationPass());
    353     optPasses.add(createSCCPPass());
    354     optPasses.add(createAggressiveDCEPass());
    355 
    356     optPasses.run(*fetch);
    357     optPasses.run(*fetch);
    358 
    359     JitManager::DumpToFile(fetch, "opt");
    360 
    361 
    362     return fetch;
    363 }
    364 
    365 //////////////////////////////////////////////////////////////////////////
    366 /// @brief Loads attributes from memory using LOADs, shuffling the
    367 /// components into SOA form.
    368 /// *Note* currently does not support component control,
    369 /// component packing, instancing
    370 /// @param fetchState - info about attributes to be fetched from memory
    371 /// @param streams - value pointer to the current vertex stream
    372 /// @param vIndices - vector value of indices to load
    373 /// @param pVtxOut - value pointer to output simdvertex struct
    374 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
    375 {
    376     // Zack shuffles; a variant of the Charleston.
    377 
    378     std::vector<Value*> vectors(16);
    379     std::vector<Constant*>    pMask(mVWidth);
    380     for(uint32_t i = 0; i < mVWidth; ++i)
    381     {
    382         pMask[i] = (C(i < 4 ? i : 4));
    383     }
    384     Constant* promoteMask = ConstantVector::get(pMask);
    385     Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
    386 
    387     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    388     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    389     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    390     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    391     curInstance->setName("curInstance");
    392 
    393     for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
    394     {
    395         Value*    elements[4] = {0};
    396         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
    397         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
    398         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
    399         uint32_t    numComponents = info.numComps;
    400         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
    401 
    402         // load path doesn't support component packing
    403         SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
    404 
    405         vectors.clear();
    406 
    407         if (fetchState.bInstanceIDOffsetEnable)
    408         {
    409             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
    410         }
    411 
    412         Value *vCurIndices;
    413         Value *startOffset;
    414         if(ied.InstanceEnable)
    415         {
    416             Value* stepRate = C(ied.InstanceAdvancementState);
    417 
    418             // prevent a div by 0 for 0 step rate
    419             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
    420             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
    421 
    422             // calc the current offset into instanced data buffer
    423             Value* calcInstance = UDIV(curInstance, stepRate);
    424 
    425             // if step rate is 0, every instance gets instance 0
    426             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
    427 
    428             vCurIndices = VBROADCAST(calcInstance);
    429 
    430             startOffset = startInstance;
    431         }
    432         else if (ied.InstanceStrideEnable)
    433         {
    434             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
    435         }
    436         else
    437         {
    438             // offset indices by baseVertex
    439             vCurIndices = ADD(vIndices, vBaseVertex);
    440 
    441             startOffset = startVertex;
    442         }
    443 
    444         // load SWR_VERTEX_BUFFER_STATE::pData
    445         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
    446 
    447         // load SWR_VERTEX_BUFFER_STATE::pitch
    448         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
    449         stride = Z_EXT(stride, mInt64Ty);
    450 
    451         // load SWR_VERTEX_BUFFER_STATE::size
    452         Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
    453         size = Z_EXT(size, mInt64Ty);
    454 
    455         Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
    456 
    457         Value *minVertex = NULL;
    458         Value *minVertexOffset = NULL;
    459         if (fetchState.bPartialVertexBuffer) {
    460             // fetch min index for low bounds checking
    461             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
    462             minVertex = LOAD(minVertex);
    463             if (!fetchState.bDisableIndexOOBCheck) {
    464                 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
    465             }
    466         }
    467 
    468         // Load from the stream.
    469         for(uint32_t lane = 0; lane < mVWidth; ++lane)
    470         {
    471             // Get index
    472             Value* index = VEXTRACT(vCurIndices, C(lane));
    473 
    474             if (fetchState.bPartialVertexBuffer) {
    475                 // clamp below minvertex
    476                 Value *isBelowMin = ICMP_SLT(index, minVertex);
    477                 index = SELECT(isBelowMin, minVertex, index);
    478             }
    479 
    480             index = Z_EXT(index, mInt64Ty);
    481 
    482             Value*    offset = MUL(index, stride);
    483             offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
    484             offset = ADD(offset, startVertexOffset);
    485 
    486             if (!fetchState.bDisableIndexOOBCheck) {
    487                 // check for out of bound access, including partial OOB, and replace them with minVertex
    488                 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
    489                 Value *oob = ICMP_ULE(endOffset, size);
    490                 if (fetchState.bPartialVertexBuffer) {
    491                     offset = SELECT(oob, offset, minVertexOffset);
    492                 } else {
    493                     offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
    494                 }
    495             }
    496 
    497             Value*    pointer = GEP(stream, offset);
    498             // We use a full-lane, but don't actually care.
    499             Value*    vptr = 0;
    500 
    501             // get a pointer to a 4 component attrib in default address space
    502             switch(bpc)
    503             {
    504                 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
    505                 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
    506                 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
    507                 default: SWR_INVALID("Unsupported underlying bpp!");
    508             }
    509 
    510             // load 4 components of attribute
    511             Value*    vec = ALIGNED_LOAD(vptr, 1, false);
    512 
    513             // Convert To FP32 internally
    514             switch(info.type[0])
    515             {
    516                 case SWR_TYPE_UNORM:
    517                     switch(bpc)
    518                     {
    519                         case 8:
    520                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    521                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
    522                             break;
    523                         case 16:
    524                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    525                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
    526                             break;
    527                         default:
    528                             SWR_INVALID("Unsupported underlying type!");
    529                             break;
    530                     }
    531                     break;
    532                 case SWR_TYPE_SNORM:
    533                     switch(bpc)
    534                     {
    535                         case 8:
    536                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    537                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
    538                             break;
    539                         case 16:
    540                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    541                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
    542                             break;
    543                         default:
    544                             SWR_INVALID("Unsupported underlying type!");
    545                             break;
    546                     }
    547                     break;
    548                 case SWR_TYPE_UINT:
    549                     // Zero extend uint32_t types.
    550                     switch(bpc)
    551                     {
    552                         case 8:
    553                         case 16:
    554                             vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
    555                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
    556                             break;
    557                         case 32:
    558                             break; // Pass through unchanged.
    559                         default:
    560                             SWR_INVALID("Unsupported underlying type!");
    561                             break;
    562                     }
    563                     break;
    564                 case SWR_TYPE_SINT:
    565                     // Sign extend SINT types.
    566                     switch(bpc)
    567                     {
    568                         case 8:
    569                         case 16:
    570                             vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
    571                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
    572                             break;
    573                         case 32:
    574                             break; // Pass through unchanged.
    575                         default:
    576                             SWR_INVALID("Unsupported underlying type!");
    577                             break;
    578                     }
    579                     break;
    580                 case SWR_TYPE_FLOAT:
    581                     switch(bpc)
    582                     {
    583                         case 32:
    584                             break; // Pass through unchanged.
    585                         default:
    586                             SWR_INVALID("Unsupported underlying type!");
    587                     }
    588                     break;
    589                 case SWR_TYPE_USCALED:
    590                     vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    591                     break;
    592                 case SWR_TYPE_SSCALED:
    593                     vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    594                     break;
    595                 case SWR_TYPE_SFIXED:
    596                     vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
    597                     break;
    598                 case SWR_TYPE_UNKNOWN:
    599                 case SWR_TYPE_UNUSED:
    600                     SWR_INVALID("Unsupported type %d!", info.type[0]);
    601             }
    602 
    603             // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
    604             // uwvec: 4 x F32, undef value
    605             Value*    wvec = VSHUFFLE(vec, uwvec, promoteMask);
    606             vectors.push_back(wvec);
    607         }
    608 
    609         std::vector<Constant*>        v01Mask(mVWidth);
    610         std::vector<Constant*>        v23Mask(mVWidth);
    611         std::vector<Constant*>        v02Mask(mVWidth);
    612         std::vector<Constant*>        v13Mask(mVWidth);
    613 
    614         // Concatenate the vectors together.
    615         elements[0] = VUNDEF_F();
    616         elements[1] = VUNDEF_F();
    617         elements[2] = VUNDEF_F();
    618         elements[3] = VUNDEF_F();
    619         for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
    620         {
    621             v01Mask[4 * b + 0] = C(0 + 4 * b);
    622             v01Mask[4 * b + 1] = C(1 + 4 * b);
    623             v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
    624             v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
    625 
    626             v23Mask[4 * b + 0] = C(2 + 4 * b);
    627             v23Mask[4 * b + 1] = C(3 + 4 * b);
    628             v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
    629             v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
    630 
    631             v02Mask[4 * b + 0] = C(0 + 4 * b);
    632             v02Mask[4 * b + 1] = C(2 + 4 * b);
    633             v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
    634             v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
    635 
    636             v13Mask[4 * b + 0] = C(1 + 4 * b);
    637             v13Mask[4 * b + 1] = C(3 + 4 * b);
    638             v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
    639             v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
    640 
    641             std::vector<Constant*>    iMask(mVWidth);
    642             for(uint32_t i = 0; i < mVWidth; ++i)
    643             {
    644                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
    645                 {
    646                     iMask[i] = C(i % 4 + mVWidth);
    647                 }
    648                 else
    649                 {
    650                     iMask[i] = C(i);
    651                 }
    652             }
    653             Constant* insertMask = ConstantVector::get(iMask);
    654             elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
    655             elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
    656             elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
    657             elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
    658         }
    659 
    660         Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
    661         Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
    662         Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
    663         Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
    664         elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
    665         elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
    666         elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
    667         elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
    668 
    669         switch(numComponents + 1)
    670         {
    671             case    1: elements[0] = VIMMED1(0.0f);
    672             case    2: elements[1] = VIMMED1(0.0f);
    673             case    3: elements[2] = VIMMED1(0.0f);
    674             case    4: elements[3] = VIMMED1(1.0f);
    675         }
    676 
    677         for(uint32_t c = 0; c < 4; ++c)
    678         {
    679 #if USE_SIMD16_SHADERS
    680             Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
    681 #else
    682             Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
    683 #endif
    684             STORE(elements[c], dest);
    685         }
    686     }
    687 }
    688 
    689 // returns true for odd formats that require special state.gather handling
    690 bool FetchJit::IsOddFormat(SWR_FORMAT format)
    691 {
    692     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    693     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    694     {
    695         return true;
    696     }
    697     return false;
    698 }
    699 
    700 // format is uniform if all components are the same size and type
    701 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
    702 {
    703     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    704     uint32_t bpc0 = info.bpc[0];
    705     uint32_t type0 = info.type[0];
    706 
    707     for (uint32_t c = 1; c < info.numComps; ++c)
    708     {
    709         if (bpc0 != info.bpc[c] || type0 != info.type[c])
    710         {
    711             return false;
    712         }
    713     }
    714     return true;
    715 }
    716 
    717 // unpacks components based on format
    718 // foreach component in the pixel
    719 //   mask off everything but this component
    720 //   shift component to LSB
    721 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
    722 {
    723     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    724 
    725     uint32_t bitOffset = 0;
    726     for (uint32_t c = 0; c < info.numComps; ++c)
    727     {
    728         uint32_t swizzledIndex = info.swizzle[c];
    729         uint32_t compBits = info.bpc[c];
    730         uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
    731         Value* comp = AND(vInput, bitmask);
    732         comp = LSHR(comp, bitOffset);
    733 
    734         result[swizzledIndex] = comp;
    735         bitOffset += compBits;
    736     }
    737 }
    738 
    739 // gather for odd component size formats
    740 // gather SIMD full pixels per lane then shift/mask to move each component to their
    741 // own vector
    742 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
    743 {
    744     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    745 
    746     // only works if pixel size is <= 32bits
    747     SWR_ASSERT(info.bpp <= 32);
    748 
    749     Value *pGather;
    750     if (info.bpp == 32)
    751     {
    752         pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
    753     }
    754     else
    755     {
    756         // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
    757         Value *pMem = ALLOCA(mSimdInt32Ty);
    758         STORE(VIMMED1(0u), pMem);
    759 
    760         pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
    761         Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
    762 
    763         for (uint32_t lane = 0; lane < mVWidth; ++lane)
    764         {
    765             // Get index
    766             Value* index = VEXTRACT(pOffsets, C(lane));
    767             Value* mask = VEXTRACT(pMask, C(lane));
    768             switch (info.bpp)
    769             {
    770             case 8:
    771             {
    772                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
    773                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
    774                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
    775                 break;
    776             }
    777 
    778             case 16:
    779             {
    780                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
    781                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
    782                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
    783                 break;
    784             }
    785             break;
    786 
    787             case 24:
    788             {
    789                 // First 16-bits of data
    790                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
    791                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
    792                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
    793 
    794                 // Last 8-bits of data
    795                 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
    796                 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
    797                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
    798                 break;
    799             }
    800 
    801             default:
    802                 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
    803                 break;
    804             }
    805         }
    806 
    807         pGather = LOAD(pMem);
    808     }
    809 
    810     for (uint32_t comp = 0; comp < 4; ++comp)
    811     {
    812         pResult[comp] = VIMMED1((int)info.defaults[comp]);
    813     }
    814 
    815     UnpackComponents(format, pGather, pResult);
    816 
    817     // cast to fp32
    818     pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    819     pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    820     pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    821     pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
    822 }
    823 
    824 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
    825 {
    826     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    827 
    828     for (uint32_t c = 0; c < info.numComps; ++c)
    829     {
    830         uint32_t compIndex = info.swizzle[c];
    831 
    832         // skip any conversion on UNUSED components
    833         if (info.type[c] == SWR_TYPE_UNUSED)
    834         {
    835             continue;
    836         }
    837 
    838         if (info.isNormalized[c])
    839         {
    840             if (info.type[c] == SWR_TYPE_SNORM)
    841             {
    842                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
    843 
    844                 /// result = c * (1.0f / (2^(n-1) - 1);
    845                 uint32_t n = info.bpc[c];
    846                 uint32_t pow2 = 1 << (n - 1);
    847                 float scale = 1.0f / (float)(pow2 - 1);
    848                 Value *vScale = VIMMED1(scale);
    849                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    850                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
    851                 texels[compIndex] = FMUL(texels[compIndex], vScale);
    852             }
    853             else
    854             {
    855                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
    856 
    857                 /// result = c * (1.0f / (2^n - 1))
    858                 uint32_t n = info.bpc[c];
    859                 uint32_t pow2 = 1 << n;
    860                 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
    861                 if (n == 24)
    862                 {
    863                     float scale = (float)(pow2 - 1);
    864                     Value* vScale = VIMMED1(scale);
    865                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    866                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
    867                     texels[compIndex] = FDIV(texels[compIndex], vScale);
    868                 }
    869                 else
    870                 {
    871                     float scale = 1.0f / (float)(pow2 - 1);
    872                     Value *vScale = VIMMED1(scale);
    873                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    874                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
    875                     texels[compIndex] = FMUL(texels[compIndex], vScale);
    876                 }
    877             }
    878             continue;
    879         }
    880     }
    881 }
    882 
    883 //////////////////////////////////////////////////////////////////////////
    884 /// @brief Loads attributes from memory using AVX2 GATHER(s)
    885 /// @param fetchState - info about attributes to be fetched from memory
    886 /// @param streams - value pointer to the current vertex stream
    887 /// @param vIndices - vector value of indices to gather
    888 /// @param pVtxOut - value pointer to output simdvertex struct
    889 #if USE_SIMD16_SHADERS
    890 #if USE_SIMD16_GATHERS
    891 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
    892     Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
    893 #else
    894 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
    895     Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
    896 #endif
    897 #else
    898 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
    899     Value* streams, Value* vIndices, Value* pVtxOut)
    900 #endif
    901 {
    902     uint32_t currentVertexElement = 0;
    903     uint32_t outputElt = 0;
    904     Value* vVertexElements[4];
    905 #if USE_SIMD16_GATHERS
    906     Value *pVtxSrc2[4];
    907 #endif
    908 
    909     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    910     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    911     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    912 #if USE_SIMD16_GATHERS
    913     Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
    914 #else
    915     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
    916 #endif
    917     curInstance->setName("curInstance");
    918 
    919     for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    920     {
    921         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
    922 
    923         // skip element if all components are disabled
    924         if (ied.ComponentPacking == ComponentEnable::NONE)
    925         {
    926             continue;
    927         }
    928 
    929         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
    930         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
    931         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
    932 
    933         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
    934 
    935         // VGATHER* takes an *i8 src pointer
    936         Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
    937 
    938         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
    939 #if USE_SIMD16_GATHERS
    940         Value *vStride16 = VBROADCAST_16(stride);
    941 #else
    942         Value *vStride = VBROADCAST(stride);
    943 #endif
    944 
    945         // max vertex index that is fully in bounds
    946         Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
    947         maxVertex = LOAD(maxVertex);
    948 
    949         Value *minVertex = NULL;
    950         if (fetchState.bPartialVertexBuffer)
    951         {
    952             // min vertex index for low bounds OOB checking
    953             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
    954             minVertex = LOAD(minVertex);
    955         }
    956 
    957         if (fetchState.bInstanceIDOffsetEnable)
    958         {
    959             // the InstanceID (curInstance) value is offset by StartInstanceLocation
    960             curInstance = ADD(curInstance, startInstance);
    961         }
    962 
    963 #if USE_SIMD16_GATHERS
    964         Value *vCurIndices16;
    965 #else
    966         Value *vCurIndices;
    967 #endif
    968         Value *startOffset;
    969 #if USE_SIMD16_GATHERS
    970         Value *vInstanceStride16 = VIMMED1_16(0);
    971 #else
    972         Value *vInstanceStride = VIMMED1(0);
    973 #endif
    974 
    975         if (ied.InstanceEnable)
    976         {
    977             Value* stepRate = C(ied.InstanceAdvancementState);
    978 
    979             // prevent a div by 0 for 0 step rate
    980             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
    981             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
    982 
    983             // calc the current offset into instanced data buffer
    984             Value* calcInstance = UDIV(curInstance, stepRate);
    985 
    986             // if step rate is 0, every instance gets instance 0
    987             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
    988 
    989 #if USE_SIMD16_GATHERS
    990             vCurIndices16 = VBROADCAST_16(calcInstance);
    991 #else
    992             vCurIndices = VBROADCAST(calcInstance);
    993 #endif
    994 
    995             startOffset = startInstance;
    996         }
    997         else if (ied.InstanceStrideEnable)
    998         {
    999             // grab the instance advancement state, determines stride in bytes from one instance to the next
   1000             Value* stepRate = C(ied.InstanceAdvancementState);
   1001 #if USE_SIMD16_GATHERS
   1002             vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
   1003 #else
   1004             vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
   1005 #endif
   1006 
   1007             // offset indices by baseVertex
   1008 #if USE_SIMD16_GATHERS
   1009             Value *vIndices16 = JOIN_16(vIndices, vIndices2);
   1010 
   1011             vCurIndices16 = ADD(vIndices16, vBaseVertex16);
   1012 #else
   1013             vCurIndices = ADD(vIndices, vBaseVertex);
   1014 #endif
   1015 
   1016             startOffset = startVertex;
   1017             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
   1018         }
   1019         else
   1020         {
   1021             // offset indices by baseVertex
   1022 #if USE_SIMD16_GATHERS
   1023             Value *vIndices16 = JOIN_16(vIndices, vIndices2);
   1024 
   1025             vCurIndices16 = ADD(vIndices16, vBaseVertex16);
   1026 #else
   1027             vCurIndices = ADD(vIndices, vBaseVertex);
   1028 #endif
   1029 
   1030             startOffset = startVertex;
   1031         }
   1032 
   1033         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
   1034         // do 64bit address offset calculations.
   1035 
   1036         // calculate byte offset to the start of the VB
   1037         Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
   1038         pStreamBase = GEP(pStreamBase, baseOffset);
   1039 
   1040         // if we have a start offset, subtract from max vertex. Used for OOB check
   1041         maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
   1042         Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
   1043         // if we have a negative value, we're already OOB. clamp at 0.
   1044         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
   1045 
   1046         if (fetchState.bPartialVertexBuffer)
   1047         {
   1048             // similary for min vertex
   1049             minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
   1050             Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
   1051             minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
   1052         }
   1053 
   1054         // Load the in bounds size of a partially valid vertex
   1055         Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
   1056         partialInboundsSize = LOAD(partialInboundsSize);
   1057 #if USE_SIMD16_GATHERS
   1058         Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
   1059         Value *vBpp = VBROADCAST_16(C(info.Bpp));
   1060         Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
   1061 #else
   1062         Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
   1063         Value *vBpp = VBROADCAST(C(info.Bpp));
   1064         Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
   1065 #endif
   1066 
   1067         // is the element is <= the partially valid size
   1068         Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
   1069 
   1070 #if USE_SIMD16_GATHERS
   1071         // override cur indices with 0 if pitch is 0
   1072         Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
   1073         vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
   1074 
   1075         // are vertices partially OOB?
   1076         Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
   1077         Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
   1078 
   1079         // are vertices fully in bounds?
   1080         Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
   1081 
   1082         Value *vGatherMask16;
   1083 
   1084         if (fetchState.bPartialVertexBuffer)
   1085         {
   1086             // are vertices below minVertex limit?
   1087             Value *vMinVertex16 = VBROADCAST_16(minVertex);
   1088             Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
   1089 
   1090             // only fetch lanes that pass both tests
   1091             vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
   1092         }
   1093         else
   1094         {
   1095             vGatherMask16 = vMaxGatherMask16;
   1096         }
   1097 
   1098         // blend in any partially OOB indices that have valid elements
   1099         vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
   1100 
   1101         // calculate the actual offsets into the VB
   1102         Value *vOffsets16 = MUL(vCurIndices16, vStride16);
   1103         vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
   1104 
   1105         // if instance stride enable is:
   1106         //  true  - add product of the instanceID and advancement state to the offst into the VB
   1107         //  false - value of vInstanceStride has been initialialized to zero
   1108         vOffsets16 = ADD(vOffsets16, vInstanceStride16);
   1109 
   1110         // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
   1111 
   1112         Value *vGatherMask  = EXTRACT_16(vGatherMask16, 0);
   1113         Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
   1114 
   1115         Value *vOffsets  = EXTRACT_16(vOffsets16, 0);
   1116         Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
   1117 #else
   1118         // override cur indices with 0 if pitch is 0
   1119         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
   1120         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
   1121 
   1122         // are vertices partially OOB?
   1123         Value* vMaxVertex = VBROADCAST(maxVertex);
   1124         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
   1125 
   1126         // are vertices fully in bounds?
   1127         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
   1128 
   1129         Value *vGatherMask;
   1130         if (fetchState.bPartialVertexBuffer)
   1131         {
   1132             // are vertices below minVertex limit?
   1133             Value *vMinVertex = VBROADCAST(minVertex);
   1134             Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
   1135 
   1136             // only fetch lanes that pass both tests
   1137             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
   1138         }
   1139         else
   1140         {
   1141             vGatherMask = vMaxGatherMask;
   1142         }
   1143 
   1144         // blend in any partially OOB indices that have valid elements
   1145         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
   1146 
   1147         // calculate the actual offsets into the VB
   1148         Value* vOffsets = MUL(vCurIndices, vStride);
   1149         vOffsets = ADD(vOffsets, vAlignmentOffsets);
   1150 
   1151         // if instance stride enable is:
   1152         //  true  - add product of the instanceID and advancement state to the offst into the VB
   1153         //  false - value of vInstanceStride has been initialialized to zero
   1154         vOffsets = ADD(vOffsets, vInstanceStride);
   1155 
   1156 #endif
   1157         // Packing and component control
   1158         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
   1159         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
   1160                                              (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
   1161 
   1162         // Special gather/conversion for formats without equal component sizes
   1163         if (IsOddFormat((SWR_FORMAT)ied.Format))
   1164         {
   1165 #if USE_SIMD16_GATHERS
   1166             Value *pResults[4];
   1167             Value *pResults2[4];
   1168             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask,  pStreamBase, vOffsets,  pResults);
   1169             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
   1170             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
   1171             ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
   1172 
   1173             for (uint32_t c = 0; c < 4; c += 1)
   1174             {
   1175                 if (isComponentEnabled(compMask, c))
   1176                 {
   1177                     // pack adjacent pairs of SIMD8s into SIMD16s
   1178                     pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
   1179 
   1180                     if (currentVertexElement > 3)
   1181                     {
   1182                         // store SIMD16s
   1183                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1184 
   1185                         StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
   1186                         // reset to the next vVertexElement to output
   1187                         currentVertexElement = 0;
   1188                     }
   1189                 }
   1190             }
   1191 #else
   1192             Value *pResults[4];
   1193             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
   1194             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
   1195 
   1196             for (uint32_t c = 0; c < 4; c += 1)
   1197             {
   1198                 if (isComponentEnabled(compMask, c))
   1199                 {
   1200                     vVertexElements[currentVertexElement++] = pResults[c];
   1201                     if (currentVertexElement > 3)
   1202                     {
   1203                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1204                         // reset to the next vVertexElement to output
   1205                         currentVertexElement = 0;
   1206                     }
   1207                 }
   1208             }
   1209 #endif
   1210         }
   1211         else if(info.type[0] == SWR_TYPE_FLOAT)
   1212         {
   1213             ///@todo: support 64 bit vb accesses
   1214             Value *gatherSrc = VIMMED1(0.0f);
   1215 #if USE_SIMD16_GATHERS
   1216             Value *gatherSrc16 = VIMMED1_16(0.0f);
   1217 #endif
   1218 
   1219             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
   1220                 "Unsupported format for standard gather fetch.");
   1221 
   1222             // Gather components from memory to store in a simdvertex structure
   1223             switch (bpc)
   1224             {
   1225                 case 16:
   1226                 {
   1227 #if USE_SIMD16_GATHERS
   1228                     Value *gatherResult[2];
   1229 
   1230                     // if we have at least one component out of x or y to fetch
   1231                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   1232                     {
   1233                         gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1234 
   1235                         // e.g. result of first 8x32bit integer gather for 16bit components
   1236                         // 256i - 0    1    2    3    4    5    6    7
   1237                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
   1238                         //
   1239                     }
   1240                     else
   1241                     {
   1242                         gatherResult[0] = VUNDEF_I_16();
   1243                     }
   1244 
   1245                     // if we have at least one component out of z or w to fetch
   1246                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   1247                     {
   1248                         // offset base to the next components(zw) in the vertex to gather
   1249                         pStreamBase = GEP(pStreamBase, C((char)4));
   1250 
   1251                         gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1252 
   1253                         // e.g. result of second 8x32bit integer gather for 16bit components
   1254                         // 256i - 0    1    2    3    4    5    6    7
   1255                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
   1256                         //
   1257                     }
   1258                     else
   1259                     {
   1260                         gatherResult[1] = VUNDEF_I_16();
   1261                     }
   1262 
   1263                     // if we have at least one component to shuffle into place
   1264                     if (compMask)
   1265                     {
   1266                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1267 
   1268                         Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
   1269                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
   1270 
   1271                         // Shuffle gathered components into place in simdvertex struct
   1272                         Shuffle16bpcGather16(args);  // outputs to vVertexElements ref
   1273                     }
   1274 #else
   1275                     Value *vGatherResult[2];
   1276 
   1277                     // if we have at least one component out of x or y to fetch
   1278                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   1279                     {
   1280                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1281                         // e.g. result of first 8x32bit integer gather for 16bit components
   1282                         // 256i - 0    1    2    3    4    5    6    7
   1283                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
   1284                         //
   1285                     }
   1286 
   1287                     // if we have at least one component out of z or w to fetch
   1288                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   1289                     {
   1290                         // offset base to the next components(zw) in the vertex to gather
   1291                         pStreamBase = GEP(pStreamBase, C((char)4));
   1292 
   1293                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1294                         // e.g. result of second 8x32bit integer gather for 16bit components
   1295                         // 256i - 0    1    2    3    4    5    6    7
   1296                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
   1297                         //
   1298                     }
   1299 
   1300                     // if we have at least one component to shuffle into place
   1301                     if (compMask)
   1302                     {
   1303                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
   1304                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
   1305 
   1306                         // Shuffle gathered components into place in simdvertex struct
   1307 #if USE_SIMD16_SHADERS
   1308                         Shuffle16bpcGather(args, useVertexID2);  // outputs to vVertexElements ref
   1309 #else
   1310                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
   1311 #endif
   1312                     }
   1313 #endif
   1314                 }
   1315                     break;
   1316                 case 32:
   1317                 {
   1318                     for (uint32_t i = 0; i < 4; i += 1)
   1319                     {
   1320 #if USE_SIMD16_GATHERS
   1321                         if (isComponentEnabled(compMask, i))
   1322                         {
   1323                             // if we need to gather the component
   1324                             if (compCtrl[i] == StoreSrc)
   1325                             {
   1326                                 // Gather a SIMD of vertices
   1327                                 // APIs allow a 4GB range for offsets
   1328                                 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
   1329                                 // But, we know that elements must be aligned for FETCH. :)
   1330                                 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
   1331                                 Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
   1332                                 pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
   1333                             }
   1334                             else
   1335                             {
   1336                                 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   1337                             }
   1338 
   1339                             if (currentVertexElement > 3)
   1340                             {
   1341                                 // store SIMD16s
   1342                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1343 
   1344                                 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
   1345                                 // reset to the next vVertexElement to output
   1346                                 currentVertexElement = 0;
   1347                             }
   1348                         }
   1349 
   1350                         // offset base to the next component in the vertex to gather
   1351                         pStreamBase = GEP(pStreamBase, C((char)4));
   1352 #else
   1353                         if (isComponentEnabled(compMask, i))
   1354                         {
   1355                             // if we need to gather the component
   1356                             if (compCtrl[i] == StoreSrc)
   1357                             {
   1358                                 // Gather a SIMD of vertices
   1359                                 // APIs allow a 4GB range for offsets
   1360                                 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
   1361                                 // But, we know that elements must be aligned for FETCH. :)
   1362                                 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
   1363                                 Value *vShiftedOffsets = LSHR(vOffsets, 1);
   1364                                 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
   1365                             }
   1366                             else
   1367                             {
   1368 #if USE_SIMD16_SHADERS
   1369                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   1370 #else
   1371                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1372 #endif
   1373                             }
   1374 
   1375                             if (currentVertexElement > 3)
   1376                             {
   1377                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1378                                 // reset to the next vVertexElement to output
   1379                                 currentVertexElement = 0;
   1380                             }
   1381                         }
   1382 
   1383                         // offset base to the next component in the vertex to gather
   1384                         pStreamBase = GEP(pStreamBase, C((char)4));
   1385 #endif
   1386                     }
   1387                 }
   1388                     break;
   1389                 case 64:
   1390                 {
   1391                     for (uint32_t i = 0; i < 4; i += 1)
   1392                     {
   1393 #if USE_SIMD16_GATHERS
   1394                         if (isComponentEnabled(compMask, i))
   1395                         {
   1396                             // if we need to gather the component
   1397                             if (compCtrl[i] == StoreSrc)
   1398                             {
   1399                                 Value *vMaskLo  = VSHUFFLE(vGatherMask,  VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
   1400                                 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
   1401                                 Value *vMaskHi  = VSHUFFLE(vGatherMask,  VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
   1402                                 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
   1403 
   1404                                 Value *vOffsetsLo  = VEXTRACTI128(vOffsets,  C(0));
   1405                                 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
   1406                                 Value *vOffsetsHi  = VEXTRACTI128(vOffsets,  C(1));
   1407                                 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
   1408 
   1409                                 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
   1410 
   1411                                 Value* pGatherLo  = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo,  vMaskLo);
   1412                                 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
   1413                                 Value* pGatherHi  = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi,  vMaskHi);
   1414                                 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
   1415 
   1416                                 pGatherLo  = VCVTPD2PS(pGatherLo);
   1417                                 pGatherLo2 = VCVTPD2PS(pGatherLo2);
   1418                                 pGatherHi  = VCVTPD2PS(pGatherHi);
   1419                                 pGatherHi2 = VCVTPD2PS(pGatherHi2);
   1420 
   1421                                 Value *pGather  = VSHUFFLE(pGatherLo,  pGatherHi,  C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
   1422                                 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
   1423 
   1424                                 // pack adjacent pairs of SIMD8s into SIMD16s
   1425                                 pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
   1426                             }
   1427                             else
   1428                             {
   1429                                 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   1430                             }
   1431 
   1432                             if (currentVertexElement > 3)
   1433                             {
   1434                                 // store SIMD16s
   1435                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1436 
   1437                                 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
   1438                                 // reset to the next vVertexElement to output
   1439                                 currentVertexElement = 0;
   1440                             }
   1441                         }
   1442 
   1443                         // offset base to the next component  in the vertex to gather
   1444                         pStreamBase = GEP(pStreamBase, C((char)8));
   1445 #else
   1446                         if (isComponentEnabled(compMask, i))
   1447                         {
   1448                             // if we need to gather the component
   1449                             if (compCtrl[i] == StoreSrc)
   1450                             {
   1451                                 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
   1452                                 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
   1453 
   1454                                 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
   1455                                 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
   1456 
   1457                                 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
   1458 
   1459                                 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
   1460                                 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
   1461 
   1462                                 pGatherLo = VCVTPD2PS(pGatherLo);
   1463                                 pGatherHi = VCVTPD2PS(pGatherHi);
   1464 
   1465                                 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
   1466 
   1467                                 vVertexElements[currentVertexElement++] = pGather;
   1468                             }
   1469                             else
   1470                             {
   1471 #if USE_SIMD16_SHADERS
   1472                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   1473 #else
   1474                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1475 #endif
   1476                             }
   1477 
   1478                             if (currentVertexElement > 3)
   1479                             {
   1480                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1481                                 // reset to the next vVertexElement to output
   1482                                 currentVertexElement = 0;
   1483                             }
   1484                         }
   1485 
   1486                         // offset base to the next component  in the vertex to gather
   1487                         pStreamBase = GEP(pStreamBase, C((char)8));
   1488 #endif
   1489                     }
   1490                 }
   1491                     break;
   1492                 default:
   1493                     SWR_INVALID("Tried to fetch invalid FP format");
   1494                     break;
   1495             }
   1496         }
   1497         else
   1498         {
   1499             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
   1500             ConversionType conversionType = CONVERT_NONE;
   1501 
   1502             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
   1503                 "Unsupported format for standard gather fetch.");
   1504 
   1505             switch(info.type[0])
   1506             {
   1507                 case SWR_TYPE_UNORM:
   1508                     conversionType = CONVERT_NORMALIZED;
   1509                 case SWR_TYPE_UINT:
   1510                     extendCastType = Instruction::CastOps::ZExt;
   1511                     break;
   1512                 case SWR_TYPE_SNORM:
   1513                     conversionType = CONVERT_NORMALIZED;
   1514                 case SWR_TYPE_SINT:
   1515                     extendCastType = Instruction::CastOps::SExt;
   1516                     break;
   1517                 case SWR_TYPE_USCALED:
   1518                     conversionType = CONVERT_USCALED;
   1519                     extendCastType = Instruction::CastOps::UIToFP;
   1520                     break;
   1521                 case SWR_TYPE_SSCALED:
   1522                     conversionType = CONVERT_SSCALED;
   1523                     extendCastType = Instruction::CastOps::SIToFP;
   1524                     break;
   1525                 case SWR_TYPE_SFIXED:
   1526                     conversionType = CONVERT_SFIXED;
   1527                     extendCastType = Instruction::CastOps::SExt;
   1528                     break;
   1529                 default:
   1530                     break;
   1531             }
   1532 
   1533             // value substituted when component of gather is masked
   1534             Value* gatherSrc = VIMMED1(0);
   1535 #if USE_SIMD16_GATHERS
   1536             Value *gatherSrc16 = VIMMED1_16(0);
   1537 #endif
   1538 
   1539             // Gather components from memory to store in a simdvertex structure
   1540             switch (bpc)
   1541             {
   1542                 case 8:
   1543                 {
   1544                     // if we have at least one component to fetch
   1545                     if (compMask)
   1546                     {
   1547 #if USE_SIMD16_GATHERS
   1548                         Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1549 
   1550                         // e.g. result of an 8x32bit integer gather for 8bit components
   1551                         // 256i - 0    1    2    3    4    5    6    7
   1552                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
   1553 
   1554                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1555 
   1556                         Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
   1557                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
   1558 
   1559                         // Shuffle gathered components into place in simdvertex struct
   1560                         Shuffle8bpcGatherd16(args);  // outputs to vVertexElements ref
   1561 #else
   1562                         Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1563                         // e.g. result of an 8x32bit integer gather for 8bit components
   1564                         // 256i - 0    1    2    3    4    5    6    7
   1565                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
   1566 
   1567                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
   1568                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
   1569 
   1570                         // Shuffle gathered components into place in simdvertex struct
   1571 #if USE_SIMD16_SHADERS
   1572                         Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
   1573 #else
   1574                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
   1575 #endif
   1576 #endif
   1577                     }
   1578                 }
   1579                 break;
   1580                 case 16:
   1581                 {
   1582 #if USE_SIMD16_GATHERS
   1583                     Value *gatherResult[2];
   1584 
   1585                     // if we have at least one component out of x or y to fetch
   1586                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   1587                     {
   1588                         gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1589 
   1590                         // e.g. result of first 8x32bit integer gather for 16bit components
   1591                         // 256i - 0    1    2    3    4    5    6    7
   1592                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
   1593                         //
   1594                     }
   1595                     else
   1596                     {
   1597                         gatherResult[0] = VUNDEF_I_16();
   1598                     }
   1599 
   1600                     // if we have at least one component out of z or w to fetch
   1601                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   1602                     {
   1603                         // offset base to the next components(zw) in the vertex to gather
   1604                         pStreamBase = GEP(pStreamBase, C((char)4));
   1605 
   1606                         gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1607 
   1608                         // e.g. result of second 8x32bit integer gather for 16bit components
   1609                         // 256i - 0    1    2    3    4    5    6    7
   1610                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
   1611                         //
   1612                     }
   1613                     else
   1614                     {
   1615                         gatherResult[1] = VUNDEF_I_16();
   1616                     }
   1617 
   1618                     // if we have at least one component to shuffle into place
   1619                     if (compMask)
   1620                     {
   1621                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1622 
   1623                         Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
   1624                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
   1625 
   1626                         // Shuffle gathered components into place in simdvertex struct
   1627                         Shuffle16bpcGather16(args);  // outputs to vVertexElements ref
   1628                     }
   1629 #else
   1630                     Value *vGatherResult[2];
   1631 
   1632                     // if we have at least one component out of x or y to fetch
   1633                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   1634                     {
   1635                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1636                         // e.g. result of first 8x32bit integer gather for 16bit components
   1637                         // 256i - 0    1    2    3    4    5    6    7
   1638                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
   1639                         //
   1640                     }
   1641 
   1642                     // if we have at least one component out of z or w to fetch
   1643                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   1644                     {
   1645                         // offset base to the next components(zw) in the vertex to gather
   1646                         pStreamBase = GEP(pStreamBase, C((char)4));
   1647 
   1648                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1649                         // e.g. result of second 8x32bit integer gather for 16bit components
   1650                         // 256i - 0    1    2    3    4    5    6    7
   1651                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
   1652                         //
   1653                     }
   1654 
   1655                     // if we have at least one component to shuffle into place
   1656                     if (compMask)
   1657                     {
   1658                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
   1659                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
   1660 
   1661                         // Shuffle gathered components into place in simdvertex struct
   1662 #if USE_SIMD16_SHADERS
   1663                         Shuffle16bpcGather(args, useVertexID2);  // outputs to vVertexElements ref
   1664 #else
   1665                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
   1666 #endif
   1667                     }
   1668 #endif
   1669                 }
   1670                 break;
   1671                 case 32:
   1672                 {
   1673                     // Gathered components into place in simdvertex struct
   1674                     for (uint32_t i = 0; i < 4; i++)
   1675                     {
   1676                         if (isComponentEnabled(compMask, i))
   1677                         {
   1678                             // if we need to gather the component
   1679                             if (compCtrl[i] == StoreSrc)
   1680                             {
   1681 #if USE_SIMD16_GATHERS
   1682                                 Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
   1683 
   1684                                 if (conversionType == CONVERT_USCALED)
   1685                                 {
   1686                                     pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
   1687                                 }
   1688                                 else if (conversionType == CONVERT_SSCALED)
   1689                                 {
   1690                                     pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
   1691                                 }
   1692                                 else if (conversionType == CONVERT_SFIXED)
   1693                                 {
   1694                                     pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
   1695                                 }
   1696 
   1697                                 pVtxSrc2[currentVertexElement++] = pGather;
   1698 
   1699                                 // e.g. result of a single 8x32bit integer gather for 32bit components
   1700                                 // 256i - 0    1    2    3    4    5    6    7
   1701                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
   1702 #else
   1703                                 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
   1704 
   1705                                 if (conversionType == CONVERT_USCALED)
   1706                                 {
   1707                                     pGather = UI_TO_FP(pGather, mSimdFP32Ty);
   1708                                 }
   1709                                 else if (conversionType == CONVERT_SSCALED)
   1710                                 {
   1711                                     pGather = SI_TO_FP(pGather, mSimdFP32Ty);
   1712                                 }
   1713                                 else if (conversionType == CONVERT_SFIXED)
   1714                                 {
   1715                                     pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
   1716                                 }
   1717 
   1718                                 vVertexElements[currentVertexElement++] = pGather;
   1719 
   1720                                 // e.g. result of a single 8x32bit integer gather for 32bit components
   1721                                 // 256i - 0    1    2    3    4    5    6    7
   1722                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
   1723 #endif
   1724                             }
   1725                             else
   1726                             {
   1727 #if USE_SIMD16_GATHERS
   1728                                 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   1729 #else
   1730 #if USE_SIMD16_SHADERS
   1731                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   1732 #else
   1733                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1734 #endif
   1735 #endif
   1736                             }
   1737 
   1738                             if (currentVertexElement > 3)
   1739                             {
   1740 #if USE_SIMD16_GATHERS
   1741                                 // store SIMD16s
   1742                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1743 
   1744                                 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
   1745 #else
   1746                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1747 #endif
   1748 
   1749                                 // reset to the next vVertexElement to output
   1750                                 currentVertexElement = 0;
   1751                             }
   1752 
   1753                         }
   1754 
   1755                         // offset base to the next component  in the vertex to gather
   1756                         pStreamBase = GEP(pStreamBase, C((char)4));
   1757                     }
   1758                 }
   1759                 break;
   1760             }
   1761         }
   1762     }
   1763 
   1764     // if we have a partially filled vVertexElement struct, output it
   1765     if (currentVertexElement > 0)
   1766     {
   1767 #if USE_SIMD16_GATHERS
   1768         // store SIMD16s
   1769         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
   1770 
   1771         StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
   1772 #else
   1773         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
   1774 #endif
   1775     }
   1776 }
   1777 
   1778 //////////////////////////////////////////////////////////////////////////
   1779 /// @brief Loads a simd of valid indices. OOB indices are set to 0
   1780 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
   1781 /// support
   1782 /// @param pIndices - pointer to 8 bit indices
   1783 /// @param pLastIndex - pointer to last valid index
   1784 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
   1785 {
   1786     // can fit 2 16 bit integers per vWidth lane
   1787     Value* vIndices =  VUNDEF_I();
   1788 
   1789     // store 0 index on stack to be used to conditionally load from if index address is OOB
   1790     Value* pZeroIndex = ALLOCA(mInt8Ty);
   1791     STORE(C((uint8_t)0), pZeroIndex);
   1792 
   1793     // Load a SIMD of index pointers
   1794     for(int64_t lane = 0; lane < mVWidth; lane++)
   1795     {
   1796         // Calculate the address of the requested index
   1797         Value *pIndex = GEP(pIndices, C(lane));
   1798 
   1799         // check if the address is less than the max index,
   1800         Value* mask = ICMP_ULT(pIndex, pLastIndex);
   1801 
   1802         // if valid, load the index. if not, load 0 from the stack
   1803         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
   1804         Value *index = LOAD(pValid, "valid index");
   1805 
   1806         // zero extended index to 32 bits and insert into the correct simd lane
   1807         index = Z_EXT(index, mInt32Ty);
   1808         vIndices = VINSERT(vIndices, index, lane);
   1809     }
   1810     return vIndices;
   1811 }
   1812 
   1813 //////////////////////////////////////////////////////////////////////////
   1814 /// @brief Loads a simd of valid indices. OOB indices are set to 0
   1815 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
   1816 /// support
   1817 /// @param pIndices - pointer to 16 bit indices
   1818 /// @param pLastIndex - pointer to last valid index
   1819 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
   1820 {
   1821     // can fit 2 16 bit integers per vWidth lane
   1822     Value* vIndices =  VUNDEF_I();
   1823 
   1824     // store 0 index on stack to be used to conditionally load from if index address is OOB
   1825     Value* pZeroIndex = ALLOCA(mInt16Ty);
   1826     STORE(C((uint16_t)0), pZeroIndex);
   1827 
   1828     // Load a SIMD of index pointers
   1829     for(int64_t lane = 0; lane < mVWidth; lane++)
   1830     {
   1831         // Calculate the address of the requested index
   1832         Value *pIndex = GEP(pIndices, C(lane));
   1833 
   1834         // check if the address is less than the max index,
   1835         Value* mask = ICMP_ULT(pIndex, pLastIndex);
   1836 
   1837         // if valid, load the index. if not, load 0 from the stack
   1838         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
   1839         Value *index = LOAD(pValid, "valid index");
   1840 
   1841         // zero extended index to 32 bits and insert into the correct simd lane
   1842         index = Z_EXT(index, mInt32Ty);
   1843         vIndices = VINSERT(vIndices, index, lane);
   1844     }
   1845     return vIndices;
   1846 }
   1847 
   1848 //////////////////////////////////////////////////////////////////////////
   1849 /// @brief Loads a simd of valid indices. OOB indices are set to 0
   1850 /// @param pIndices - pointer to 32 bit indices
   1851 /// @param pLastIndex - pointer to last valid index
   1852 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
   1853 {
   1854     DataLayout dL(JM()->mpCurrentModule);
   1855     unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
   1856     Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
   1857     Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
   1858 
   1859     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
   1860     Value* numIndicesLeft = SUB(iLastIndex,iIndices);
   1861     numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
   1862     numIndicesLeft = SDIV(numIndicesLeft, C(4));
   1863 
   1864     // create a vector of index counts from the base index ptr passed into the fetch
   1865     const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
   1866     Constant* vIndexOffsets = ConstantVector::get(vecIndices);
   1867 
   1868     // compare index count to the max valid index
   1869     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
   1870     //     vIndexOffsets  0 1 2 3 4 5 6 7
   1871     //     ------------------------------
   1872     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
   1873     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
   1874     Value* vMaxIndex = VBROADCAST(numIndicesLeft);
   1875     Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
   1876 
   1877     // VMASKLOAD takes an *i8 src pointer
   1878     pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
   1879 
   1880     // Load the indices; OOB loads 0
   1881     return MASKLOADD(pIndices,vIndexMask);
   1882 }
   1883 
   1884 //////////////////////////////////////////////////////////////////////////
   1885 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
   1886 /// denormalizes if needed, converts to F32 if needed, and positions in
   1887 //  the proper SIMD rows to be output to the simdvertex structure
   1888 /// @param args: (tuple of args, listed below)
   1889 ///   @param vGatherResult - 8 gathered 8bpc vertices
   1890 ///   @param pVtxOut - base pointer to output simdvertex struct
   1891 ///   @param extendType - sign extend or zero extend
   1892 ///   @param bNormalized - do we need to denormalize?
   1893 ///   @param currentVertexElement - reference to the current vVertexElement
   1894 ///   @param outputElt - reference to the current offset from simdvertex we're o
   1895 ///   @param compMask - component packing mask
   1896 ///   @param compCtrl - component control val
   1897 ///   @param vVertexElements[4] - vertex components to output
   1898 ///   @param swizzle[4] - component swizzle location
   1899 #if USE_SIMD16_GATHERS
   1900 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
   1901 {
   1902     // Unpack tuple args
   1903     Value*& vGatherResult = std::get<0>(args);
   1904     Value* pVtxOut = std::get<1>(args);
   1905     const Instruction::CastOps extendType = std::get<2>(args);
   1906     const ConversionType conversionType = std::get<3>(args);
   1907     uint32_t &currentVertexElement = std::get<4>(args);
   1908     uint32_t &outputElt = std::get<5>(args);
   1909     const ComponentEnable compMask = std::get<6>(args);
   1910     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
   1911     Value* (&vVertexElements)[4] = std::get<8>(args);
   1912     const uint32_t(&swizzle)[4] = std::get<9>(args);
   1913 
   1914     // cast types
   1915     Type *vGatherTy = mSimdInt32Ty;
   1916     Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   1917 
   1918     // have to do extra work for sign extending
   1919     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
   1920     {
   1921         Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
   1922         Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1923 
   1924         // shuffle mask, including any swizzling
   1925         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
   1926         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
   1927         Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
   1928             char(y), char(y + 4), char(y + 8), char(y + 12),
   1929             char(z), char(z + 4), char(z + 8), char(z + 12),
   1930             char(w), char(w + 4), char(w + 8), char(w + 12),
   1931             char(x), char(x + 4), char(x + 8), char(x + 12),
   1932             char(y), char(y + 4), char(y + 8), char(y + 12),
   1933             char(z), char(z + 4), char(z + 8), char(z + 12),
   1934             char(w), char(w + 4), char(w + 8), char(w + 12) });
   1935 
   1936         // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
   1937 
   1938         Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
   1939         Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
   1940 
   1941         Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
   1942         Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
   1943 
   1944         // after pshufb: group components together in each 128bit lane
   1945         // 256i - 0    1    2    3    4    5    6    7
   1946         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
   1947 
   1948         Value *vi128XY_lo = nullptr;
   1949         Value *vi128XY_hi = nullptr;
   1950         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   1951         {
   1952             vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
   1953             vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
   1954 
   1955             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
   1956             // 256i - 0    1    2    3    4    5    6    7
   1957             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
   1958         }
   1959 
   1960         // do the same for zw components
   1961         Value *vi128ZW_lo = nullptr;
   1962         Value *vi128ZW_hi = nullptr;
   1963         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   1964         {
   1965             vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
   1966             vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
   1967         }
   1968 
   1969         // init denormalize variables if needed
   1970         Instruction::CastOps fpCast;
   1971         Value *conversionFactor;
   1972 
   1973         switch (conversionType)
   1974         {
   1975         case CONVERT_NORMALIZED:
   1976             fpCast = Instruction::CastOps::SIToFP;
   1977             conversionFactor = VIMMED1((float)(1.0 / 127.0));
   1978             break;
   1979         case CONVERT_SSCALED:
   1980             fpCast = Instruction::CastOps::SIToFP;
   1981             conversionFactor = VIMMED1((float)(1.0));
   1982             break;
   1983         case CONVERT_USCALED:
   1984             SWR_INVALID("Type should not be sign extended!");
   1985             conversionFactor = nullptr;
   1986             break;
   1987         default:
   1988             SWR_ASSERT(conversionType == CONVERT_NONE);
   1989             conversionFactor = nullptr;
   1990             break;
   1991         }
   1992 
   1993         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
   1994         for (uint32_t i = 0; i < 4; i++)
   1995         {
   1996             if (isComponentEnabled(compMask, i))
   1997             {
   1998                 if (compCtrl[i] == ComponentControl::StoreSrc)
   1999                 {
   2000                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   2001                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   2002                     // if x or y, use vi128XY permute result, else use vi128ZW
   2003                     Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
   2004                     Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
   2005 
   2006                     // sign extend
   2007                     Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
   2008                     Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
   2009 
   2010                     // denormalize if needed
   2011                     if (conversionType != CONVERT_NONE)
   2012                     {
   2013                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
   2014                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
   2015                     }
   2016 
   2017                     vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
   2018 
   2019                     currentVertexElement += 1;
   2020                 }
   2021                 else
   2022                 {
   2023                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   2024                 }
   2025 
   2026                 if (currentVertexElement > 3)
   2027                 {
   2028                     StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
   2029                     // reset to the next vVertexElement to output
   2030                     currentVertexElement = 0;
   2031                 }
   2032             }
   2033         }
   2034     }
   2035     // else zero extend
   2036     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
   2037     {
   2038         // init denormalize variables if needed
   2039         Instruction::CastOps fpCast;
   2040         Value *conversionFactor;
   2041 
   2042         switch (conversionType)
   2043         {
   2044         case CONVERT_NORMALIZED:
   2045             fpCast = Instruction::CastOps::UIToFP;
   2046             conversionFactor = VIMMED1((float)(1.0 / 255.0));
   2047             break;
   2048         case CONVERT_USCALED:
   2049             fpCast = Instruction::CastOps::UIToFP;
   2050             conversionFactor = VIMMED1((float)(1.0));
   2051             break;
   2052         case CONVERT_SSCALED:
   2053             SWR_INVALID("Type should not be zero extended!");
   2054             conversionFactor = nullptr;
   2055             break;
   2056         default:
   2057             SWR_ASSERT(conversionType == CONVERT_NONE);
   2058             conversionFactor = nullptr;
   2059             break;
   2060         }
   2061 
   2062         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
   2063         for (uint32_t i = 0; i < 4; i++)
   2064         {
   2065             if (isComponentEnabled(compMask, i))
   2066             {
   2067                 if (compCtrl[i] == ComponentControl::StoreSrc)
   2068                 {
   2069                     // pshufb masks for each component
   2070                     Value *vConstMask;
   2071                     switch (swizzle[i])
   2072                     {
   2073                     case 0:
   2074                         // x shuffle mask
   2075                         vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
   2076                             0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
   2077                         break;
   2078                     case 1:
   2079                         // y shuffle mask
   2080                         vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
   2081                             1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
   2082                         break;
   2083                     case 2:
   2084                         // z shuffle mask
   2085                         vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
   2086                             2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
   2087                         break;
   2088                     case 3:
   2089                         // w shuffle mask
   2090                         vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
   2091                             3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
   2092                         break;
   2093                     default:
   2094                         vConstMask = nullptr;
   2095                         break;
   2096                     }
   2097 
   2098                     Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
   2099                     Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
   2100 
   2101                     Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
   2102                     Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
   2103 
   2104                     // after pshufb for x channel
   2105                     // 256i - 0    1    2    3    4    5    6    7
   2106                     //        x000 x000 x000 x000 x000 x000 x000 x000
   2107 
   2108                     // denormalize if needed
   2109                     if (conversionType != CONVERT_NONE)
   2110                     {
   2111                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
   2112                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
   2113                     }
   2114 
   2115                     vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
   2116 
   2117                     currentVertexElement += 1;
   2118                 }
   2119                 else
   2120                 {
   2121                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   2122                 }
   2123 
   2124                 if (currentVertexElement > 3)
   2125                 {
   2126                     StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
   2127                     // reset to the next vVertexElement to output
   2128                     currentVertexElement = 0;
   2129                 }
   2130             }
   2131         }
   2132     }
   2133     else
   2134     {
   2135         SWR_INVALID("Unsupported conversion type");
   2136     }
   2137 }
   2138 
   2139 #else
   2140 #if USE_SIMD16_SHADERS
   2141 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
   2142 #else
   2143 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
   2144 #endif
   2145 {
   2146     // Unpack tuple args
   2147     Value*& vGatherResult = std::get<0>(args);
   2148     Value* pVtxOut = std::get<1>(args);
   2149     const Instruction::CastOps extendType = std::get<2>(args);
   2150     const ConversionType conversionType = std::get<3>(args);
   2151     uint32_t &currentVertexElement = std::get<4>(args);
   2152     uint32_t &outputElt = std::get<5>(args);
   2153     const ComponentEnable compMask = std::get<6>(args);
   2154     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
   2155     Value* (&vVertexElements)[4] = std::get<8>(args);
   2156     const uint32_t(&swizzle)[4] = std::get<9>(args);
   2157 
   2158     // cast types
   2159     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   2160 
   2161     for (uint32_t i = 0; i < 4; i++)
   2162     {
   2163         if (!isComponentEnabled(compMask, i))
   2164             continue;
   2165 
   2166         if (compCtrl[i] == ComponentControl::StoreSrc)
   2167         {
   2168             std::vector<uint32_t> vShuffleMasks[4] = {
   2169                 { 0, 4,  8, 12, 16, 20, 24, 28 }, // x
   2170                 { 1, 5,  9, 13, 17, 21, 25, 29 }, // y
   2171                 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
   2172                 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
   2173             };
   2174 
   2175             Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
   2176                 UndefValue::get(v32x8Ty),
   2177                 vShuffleMasks[swizzle[i]]);
   2178 
   2179             if ((extendType == Instruction::CastOps::SExt) ||
   2180                 (extendType == Instruction::CastOps::SIToFP)) {
   2181                 switch (conversionType)
   2182                 {
   2183                 case CONVERT_NORMALIZED:
   2184                     val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
   2185                     break;
   2186                 case CONVERT_SSCALED:
   2187                     val = SI_TO_FP(val, mSimdFP32Ty);
   2188                     break;
   2189                 case CONVERT_USCALED:
   2190                     SWR_INVALID("Type should not be sign extended!");
   2191                     break;
   2192                 default:
   2193                     SWR_ASSERT(conversionType == CONVERT_NONE);
   2194                     val = S_EXT(val, mSimdInt32Ty);
   2195                     break;
   2196                 }
   2197             }
   2198             else if ((extendType == Instruction::CastOps::ZExt) ||
   2199                 (extendType == Instruction::CastOps::UIToFP)) {
   2200                 switch (conversionType)
   2201                 {
   2202                 case CONVERT_NORMALIZED:
   2203                     val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
   2204                     break;
   2205                 case CONVERT_SSCALED:
   2206                     SWR_INVALID("Type should not be zero extended!");
   2207                     break;
   2208                 case CONVERT_USCALED:
   2209                     val = UI_TO_FP(val, mSimdFP32Ty);
   2210                     break;
   2211                 default:
   2212                     SWR_ASSERT(conversionType == CONVERT_NONE);
   2213                     val = Z_EXT(val, mSimdInt32Ty);
   2214                     break;
   2215                 }
   2216             }
   2217             else
   2218             {
   2219                 SWR_INVALID("Unsupported conversion type");
   2220             }
   2221 
   2222             vVertexElements[currentVertexElement++] = val;
   2223         }
   2224         else
   2225         {
   2226 #if USE_SIMD16_SHADERS
   2227             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   2228 #else
   2229             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   2230 #endif
   2231         }
   2232 
   2233         if (currentVertexElement > 3)
   2234         {
   2235             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   2236             // reset to the next vVertexElement to output
   2237             currentVertexElement = 0;
   2238         }
   2239     }
   2240 }
   2241 
   2242 #endif
   2243 //////////////////////////////////////////////////////////////////////////
   2244 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
   2245 /// denormalizes if needed, converts to F32 if needed, and positions in
   2246 //  the proper SIMD rows to be output to the simdvertex structure
   2247 /// @param args: (tuple of args, listed below)
   2248 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
   2249 ///   @param pVtxOut - base pointer to output simdvertex struct
   2250 ///   @param extendType - sign extend or zero extend
   2251 ///   @param bNormalized - do we need to denormalize?
   2252 ///   @param currentVertexElement - reference to the current vVertexElement
   2253 ///   @param outputElt - reference to the current offset from simdvertex we're o
   2254 ///   @param compMask - component packing mask
   2255 ///   @param compCtrl - component control val
   2256 ///   @param vVertexElements[4] - vertex components to output
   2257 #if USE_SIMD16_GATHERS
   2258 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
   2259 {
   2260     // Unpack tuple args
   2261     Value* (&vGatherResult)[2] = std::get<0>(args);
   2262     Value* pVtxOut = std::get<1>(args);
   2263     const Instruction::CastOps extendType = std::get<2>(args);
   2264     const ConversionType conversionType = std::get<3>(args);
   2265     uint32_t &currentVertexElement = std::get<4>(args);
   2266     uint32_t &outputElt = std::get<5>(args);
   2267     const ComponentEnable compMask = std::get<6>(args);
   2268     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
   2269     Value* (&vVertexElements)[4] = std::get<8>(args);
   2270 
   2271     // cast types
   2272     Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
   2273     Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   2274 
   2275     // have to do extra work for sign extending
   2276     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
   2277     {
   2278         // is this PP float?
   2279         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
   2280 
   2281         Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
   2282         Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   2283 
   2284         // shuffle mask
   2285         Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
   2286                                       0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
   2287         Value *vi128XY_lo = nullptr;
   2288         Value *vi128XY_hi = nullptr;
   2289         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
   2290         {
   2291             // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
   2292 
   2293             Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
   2294             Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
   2295 
   2296             Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
   2297             Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
   2298 
   2299             // after pshufb: group components together in each 128bit lane
   2300             // 256i - 0    1    2    3    4    5    6    7
   2301             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
   2302 
   2303             vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2304             vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2305 
   2306             // after PERMD: move and pack xy components into each 128bit lane
   2307             // 256i - 0    1    2    3    4    5    6    7
   2308             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
   2309         }
   2310 
   2311         // do the same for zw components
   2312         Value *vi128ZW_lo = nullptr;
   2313         Value *vi128ZW_hi = nullptr;
   2314         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
   2315         {
   2316             Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
   2317             Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
   2318 
   2319             Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
   2320             Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
   2321 
   2322             vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2323             vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2324         }
   2325 
   2326         // init denormalize variables if needed
   2327         Instruction::CastOps IntToFpCast;
   2328         Value *conversionFactor;
   2329 
   2330         switch (conversionType)
   2331         {
   2332         case CONVERT_NORMALIZED:
   2333             IntToFpCast = Instruction::CastOps::SIToFP;
   2334             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
   2335             break;
   2336         case CONVERT_SSCALED:
   2337             IntToFpCast = Instruction::CastOps::SIToFP;
   2338             conversionFactor = VIMMED1((float)(1.0));
   2339             break;
   2340         case CONVERT_USCALED:
   2341             SWR_INVALID("Type should not be sign extended!");
   2342             conversionFactor = nullptr;
   2343             break;
   2344         default:
   2345             SWR_ASSERT(conversionType == CONVERT_NONE);
   2346             conversionFactor = nullptr;
   2347             break;
   2348         }
   2349 
   2350         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
   2351         for (uint32_t i = 0; i < 4; i++)
   2352         {
   2353             if (isComponentEnabled(compMask, i))
   2354             {
   2355                 if (compCtrl[i] == ComponentControl::StoreSrc)
   2356                 {
   2357                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   2358                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   2359                     // if x or y, use vi128XY permute result, else use vi128ZW
   2360                     Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
   2361                     Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
   2362 
   2363                     if (bFP)
   2364                     {
   2365                         // extract 128 bit lanes to sign extend each component
   2366                         Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
   2367                         Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
   2368 
   2369                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
   2370                     }
   2371                     else
   2372                     {
   2373                         // extract 128 bit lanes to sign extend each component
   2374                         Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
   2375                         Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
   2376 
   2377                         // denormalize if needed
   2378                         if (conversionType != CONVERT_NONE)
   2379                         {
   2380                             temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
   2381                             temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
   2382                         }
   2383 
   2384                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
   2385                     }
   2386 
   2387                     currentVertexElement += 1;
   2388                 }
   2389                 else
   2390                 {
   2391                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   2392                 }
   2393 
   2394                 if (currentVertexElement > 3)
   2395                 {
   2396                     StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
   2397                     // reset to the next vVertexElement to output
   2398                     currentVertexElement = 0;
   2399                 }
   2400             }
   2401         }
   2402     }
   2403     // else zero extend
   2404     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
   2405     {
   2406         // pshufb masks for each component
   2407         Value *vConstMask[2];
   2408 
   2409         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
   2410         {
   2411             // x/z shuffle mask
   2412             vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
   2413                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
   2414         }
   2415 
   2416         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
   2417         {
   2418             // y/w shuffle mask
   2419             vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
   2420                 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
   2421         }
   2422 
   2423         // init denormalize variables if needed
   2424         Instruction::CastOps fpCast;
   2425         Value* conversionFactor;
   2426 
   2427         switch (conversionType)
   2428         {
   2429         case CONVERT_NORMALIZED:
   2430             fpCast = Instruction::CastOps::UIToFP;
   2431             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
   2432             break;
   2433         case CONVERT_USCALED:
   2434             fpCast = Instruction::CastOps::UIToFP;
   2435             conversionFactor = VIMMED1((float)(1.0f));
   2436             break;
   2437         case CONVERT_SSCALED:
   2438             SWR_INVALID("Type should not be zero extended!");
   2439             conversionFactor = nullptr;
   2440             break;
   2441         default:
   2442             SWR_ASSERT(conversionType == CONVERT_NONE);
   2443             conversionFactor = nullptr;
   2444             break;
   2445         }
   2446 
   2447         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
   2448         for (uint32_t i = 0; i < 4; i++)
   2449         {
   2450             if (isComponentEnabled(compMask, i))
   2451             {
   2452                 if (compCtrl[i] == ComponentControl::StoreSrc)
   2453                 {
   2454                     // select correct constMask for x/z or y/w pshufb
   2455                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
   2456                     // if x or y, use vi128XY permute result, else use vi128ZW
   2457                     uint32_t selectedGather = (i < 2) ? 0 : 1;
   2458 
   2459                     // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
   2460 
   2461                     Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
   2462                     Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
   2463 
   2464                     Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
   2465                     Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
   2466 
   2467                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
   2468                     // 256i - 0    1    2    3    4    5    6    7
   2469                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
   2470 
   2471                     // denormalize if needed
   2472                     if (conversionType != CONVERT_NONE)
   2473                     {
   2474                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
   2475                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
   2476                     }
   2477 
   2478                     vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
   2479 
   2480                     currentVertexElement += 1;
   2481                 }
   2482                 else
   2483                 {
   2484                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
   2485                 }
   2486 
   2487                 if (currentVertexElement > 3)
   2488                 {
   2489                     StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
   2490                     // reset to the next vVertexElement to output
   2491                     currentVertexElement = 0;
   2492                 }
   2493             }
   2494         }
   2495     }
   2496     else
   2497     {
   2498         SWR_INVALID("Unsupported conversion type");
   2499     }
   2500 }
   2501 
   2502 #else
   2503 #if USE_SIMD16_SHADERS
   2504 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
   2505 #else
   2506 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
   2507 #endif
   2508 {
   2509     // Unpack tuple args
   2510     Value* (&vGatherResult)[2] = std::get<0>(args);
   2511     Value* pVtxOut = std::get<1>(args);
   2512     const Instruction::CastOps extendType = std::get<2>(args);
   2513     const ConversionType conversionType = std::get<3>(args);
   2514     uint32_t &currentVertexElement = std::get<4>(args);
   2515     uint32_t &outputElt = std::get<5>(args);
   2516     const ComponentEnable compMask = std::get<6>(args);
   2517     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
   2518     Value* (&vVertexElements)[4] = std::get<8>(args);
   2519 
   2520     // cast types
   2521     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
   2522     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   2523 
   2524                                                            // have to do extra work for sign extending
   2525     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
   2526         (extendType == Instruction::CastOps::FPExt))
   2527     {
   2528         // is this PP float?
   2529         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
   2530 
   2531         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
   2532         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   2533 
   2534                                                                                                      // shuffle mask
   2535         Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
   2536             0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
   2537         Value* vi128XY = nullptr;
   2538         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
   2539             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
   2540             // after pshufb: group components together in each 128bit lane
   2541             // 256i - 0    1    2    3    4    5    6    7
   2542             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
   2543 
   2544             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2545             // after PERMD: move and pack xy components into each 128bit lane
   2546             // 256i - 0    1    2    3    4    5    6    7
   2547             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
   2548         }
   2549 
   2550         // do the same for zw components
   2551         Value* vi128ZW = nullptr;
   2552         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
   2553             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
   2554             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
   2555         }
   2556 
   2557         // init denormalize variables if needed
   2558         Instruction::CastOps IntToFpCast;
   2559         Value* conversionFactor;
   2560 
   2561         switch (conversionType)
   2562         {
   2563         case CONVERT_NORMALIZED:
   2564             IntToFpCast = Instruction::CastOps::SIToFP;
   2565             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
   2566             break;
   2567         case CONVERT_SSCALED:
   2568             IntToFpCast = Instruction::CastOps::SIToFP;
   2569             conversionFactor = VIMMED1((float)(1.0));
   2570             break;
   2571         case CONVERT_USCALED:
   2572             SWR_INVALID("Type should not be sign extended!");
   2573             conversionFactor = nullptr;
   2574             break;
   2575         default:
   2576             SWR_ASSERT(conversionType == CONVERT_NONE);
   2577             conversionFactor = nullptr;
   2578             break;
   2579         }
   2580 
   2581         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
   2582         for (uint32_t i = 0; i < 4; i++)
   2583         {
   2584             if (isComponentEnabled(compMask, i))
   2585             {
   2586                 if (compCtrl[i] == ComponentControl::StoreSrc)
   2587                 {
   2588                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   2589                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   2590                     // if x or y, use vi128XY permute result, else use vi128ZW
   2591                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   2592 
   2593                     if (bFP) {
   2594                         // extract 128 bit lanes to sign extend each component
   2595                         vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
   2596                     }
   2597                     else {
   2598                         // extract 128 bit lanes to sign extend each component
   2599                         vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
   2600 
   2601                         // denormalize if needed
   2602                         if (conversionType != CONVERT_NONE) {
   2603                             vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   2604                         }
   2605                     }
   2606                     currentVertexElement++;
   2607                 }
   2608                 else
   2609                 {
   2610 #if USE_SIMD16_SHADERS
   2611                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   2612 #else
   2613                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   2614 #endif
   2615                 }
   2616 
   2617                 if (currentVertexElement > 3)
   2618                 {
   2619                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   2620                     // reset to the next vVertexElement to output
   2621                     currentVertexElement = 0;
   2622                 }
   2623             }
   2624         }
   2625     }
   2626     // else zero extend
   2627     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
   2628     {
   2629         // pshufb masks for each component
   2630         Value* vConstMask[2];
   2631         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
   2632             // x/z shuffle mask
   2633             vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
   2634                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
   2635         }
   2636 
   2637         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
   2638             // y/w shuffle mask
   2639             vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
   2640                 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
   2641         }
   2642 
   2643         // init denormalize variables if needed
   2644         Instruction::CastOps fpCast;
   2645         Value* conversionFactor;
   2646 
   2647         switch (conversionType)
   2648         {
   2649         case CONVERT_NORMALIZED:
   2650             fpCast = Instruction::CastOps::UIToFP;
   2651             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
   2652             break;
   2653         case CONVERT_USCALED:
   2654             fpCast = Instruction::CastOps::UIToFP;
   2655             conversionFactor = VIMMED1((float)(1.0f));
   2656             break;
   2657         case CONVERT_SSCALED:
   2658             SWR_INVALID("Type should not be zero extended!");
   2659             conversionFactor = nullptr;
   2660             break;
   2661         default:
   2662             SWR_ASSERT(conversionType == CONVERT_NONE);
   2663             conversionFactor = nullptr;
   2664             break;
   2665         }
   2666 
   2667         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
   2668         for (uint32_t i = 0; i < 4; i++)
   2669         {
   2670             if (isComponentEnabled(compMask, i))
   2671             {
   2672                 if (compCtrl[i] == ComponentControl::StoreSrc)
   2673                 {
   2674                     // select correct constMask for x/z or y/w pshufb
   2675                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
   2676                     // if x or y, use vi128XY permute result, else use vi128ZW
   2677                     uint32_t selectedGather = (i < 2) ? 0 : 1;
   2678 
   2679                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
   2680                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
   2681                     // 256i - 0    1    2    3    4    5    6    7
   2682                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
   2683 
   2684                     // denormalize if needed
   2685                     if (conversionType != CONVERT_NONE)
   2686                     {
   2687                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   2688                     }
   2689                     currentVertexElement++;
   2690                 }
   2691                 else
   2692                 {
   2693 #if USE_SIMD16_SHADERS
   2694                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
   2695 #else
   2696                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   2697 #endif
   2698                 }
   2699 
   2700                 if (currentVertexElement > 3)
   2701                 {
   2702                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   2703                     // reset to the next vVertexElement to output
   2704                     currentVertexElement = 0;
   2705                 }
   2706             }
   2707         }
   2708     }
   2709     else
   2710     {
   2711         SWR_INVALID("Unsupported conversion type");
   2712     }
   2713 }
   2714 
   2715 #endif
   2716 //////////////////////////////////////////////////////////////////////////
   2717 /// @brief Output a simdvertex worth of elements to the current outputElt
   2718 /// @param pVtxOut - base address of VIN output struct
   2719 /// @param outputElt - simdvertex offset in VIN to write to
   2720 /// @param numEltsToStore - number of simdvertex rows to write out
   2721 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
   2722 #if USE_SIMD16_GATHERS
   2723 void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
   2724 {
   2725     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
   2726 
   2727     for (uint32_t c = 0; c < numEltsToStore; ++c)
   2728     {
   2729         // STORE expects FP32 x vWidth type, just bitcast if needed
   2730         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
   2731         {
   2732 #if FETCH_DUMP_VERTEX
   2733             PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
   2734 #endif
   2735             vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
   2736         }
   2737 #if FETCH_DUMP_VERTEX
   2738         else
   2739         {
   2740             PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
   2741         }
   2742 #endif
   2743         // outputElt * 4 = offsetting by the size of a simdvertex
   2744         // + c offsets to a 32bit x vWidth row within the current vertex
   2745         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
   2746         STORE(vVertexElements[c], dest);
   2747     }
   2748 }
   2749 
   2750 #else
   2751 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
   2752 {
   2753     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
   2754 
   2755     for (uint32_t c = 0; c < numEltsToStore; ++c)
   2756     {
   2757         // STORE expects FP32 x vWidth type, just bitcast if needed
   2758         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
   2759         {
   2760 #if FETCH_DUMP_VERTEX
   2761             PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
   2762 #endif
   2763             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
   2764         }
   2765 #if FETCH_DUMP_VERTEX
   2766         else
   2767         {
   2768             PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
   2769         }
   2770 #endif
   2771         // outputElt * 4 = offsetting by the size of a simdvertex
   2772         // + c offsets to a 32bit x vWidth row within the current vertex
   2773 #if USE_SIMD16_SHADERS
   2774         Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
   2775 #else
   2776         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
   2777 #endif
   2778         STORE(vVertexElements[c], dest);
   2779     }
   2780 }
   2781 
   2782 #endif
   2783 //////////////////////////////////////////////////////////////////////////
   2784 /// @brief Generates a constant vector of values based on the
   2785 /// ComponentControl value
   2786 /// @param ctrl - ComponentControl value
   2787 #if USE_SIMD16_GATHERS
   2788 Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
   2789 {
   2790     switch (ctrl)
   2791     {
   2792         case NoStore:
   2793             return VUNDEF_I_16();
   2794         case Store0:
   2795             return VIMMED1_16(0);
   2796         case Store1Fp:
   2797             return VIMMED1_16(1.0f);
   2798         case Store1Int:
   2799             return VIMMED1_16(1);
   2800         case StoreVertexId:
   2801         {
   2802             Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID  })), mSimdFP32Ty);
   2803             Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
   2804 
   2805             Value *pId = JOIN_16(pId_lo, pId_hi);
   2806 
   2807             return pId;
   2808         }
   2809         case StoreInstanceId:
   2810         {
   2811             Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
   2812             return VBROADCAST_16(pId);
   2813         }
   2814 
   2815 
   2816         case StoreSrc:
   2817         default:
   2818             SWR_INVALID("Invalid component control");
   2819             return VUNDEF_I_16();
   2820     }
   2821 }
   2822 
   2823 #else
   2824 #if USE_SIMD16_SHADERS
   2825 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
   2826 #else
   2827 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
   2828 #endif
   2829 {
   2830     switch (ctrl)
   2831     {
   2832     case NoStore:
   2833         return VUNDEF_I();
   2834     case Store0:
   2835         return VIMMED1(0);
   2836     case Store1Fp:
   2837         return VIMMED1(1.0f);
   2838     case Store1Int:
   2839         return VIMMED1(1);
   2840     case StoreVertexId:
   2841         {
   2842 #if USE_SIMD16_SHADERS
   2843             Value *pId;
   2844             if (useVertexID2)
   2845             {
   2846                 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
   2847             }
   2848             else
   2849             {
   2850                 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
   2851             }
   2852 #else
   2853             Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
   2854 #endif
   2855             return pId;
   2856         }
   2857     case StoreInstanceId:
   2858         {
   2859             Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
   2860             return VBROADCAST(pId);
   2861         }
   2862 
   2863 
   2864     case StoreSrc:
   2865     default:
   2866         SWR_INVALID("Invalid component control");
   2867         return VUNDEF_I();
   2868     }
   2869 }
   2870 
   2871 #endif
   2872 //////////////////////////////////////////////////////////////////////////
   2873 /// @brief Returns the enable mask for the specified component.
   2874 /// @param enableMask - enable bits
   2875 /// @param component - component to check if enabled.
   2876 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
   2877 {
   2878     switch (component)
   2879     {
   2880         // X
   2881     case 0: return (enableMask & ComponentEnable::X);
   2882         // Y
   2883     case 1: return (enableMask & ComponentEnable::Y);
   2884         // Z
   2885     case 2: return (enableMask & ComponentEnable::Z);
   2886         // W
   2887     case 3: return (enableMask & ComponentEnable::W);
   2888 
   2889     default: return false;
   2890     }
   2891 }
   2892 
   2893 // Don't want two threads compiling the same fetch shader simultaneously
   2894 // Has problems in the JIT cache implementation
   2895 // This is only a problem for fetch right now.
   2896 static std::mutex gFetchCodegenMutex;
   2897 
   2898 //////////////////////////////////////////////////////////////////////////
   2899 /// @brief JITs from fetch shader IR
   2900 /// @param hJitMgr - JitManager handle
   2901 /// @param func   - LLVM function IR
   2902 /// @return PFN_FETCH_FUNC - pointer to fetch code
   2903 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
   2904 {
   2905     const llvm::Function* func = (const llvm::Function*)hFunc;
   2906     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
   2907     PFN_FETCH_FUNC pfnFetch;
   2908 
   2909     gFetchCodegenMutex.lock();
   2910     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
   2911     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
   2912     pJitMgr->mIsModuleFinalized = true;
   2913 
   2914 #if defined(KNOB_SWRC_TRACING)
   2915     char fName[1024];
   2916     const char *funcName = func->getName().data();
   2917     sprintf(fName, "%s.bin", funcName);
   2918     FILE *fd = fopen(fName, "wb");
   2919     fwrite((void *)pfnFetch, 1, 2048, fd);
   2920     fclose(fd);
   2921 #endif
   2922 
   2923     pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
   2924     gFetchCodegenMutex.unlock();
   2925 
   2926 
   2927 
   2928     return pfnFetch;
   2929 }
   2930 
   2931 //////////////////////////////////////////////////////////////////////////
   2932 /// @brief JIT compiles fetch shader
   2933 /// @param hJitMgr - JitManager handle
   2934 /// @param state   - fetch state to build function from
   2935 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
   2936 {
   2937     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
   2938 
   2939     pJitMgr->SetupNewModule();
   2940 
   2941     FetchJit theJit(pJitMgr);
   2942     HANDLE hFunc = theJit.Create(state);
   2943 
   2944     return JitFetchFunc(hJitMgr, hFunc);
   2945 }
   2946