      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file fetch_jit.cpp
     24 *
     25 * @brief Implementation of the fetch jitter
     26 *
     27 * Notes:
     28 *
     29 ******************************************************************************/
     30 #include "jit_api.h"
     31 #include "fetch_jit.h"
     32 #include "builder.h"
     33 #include "state_llvm.h"
     34 #include <sstream>
     35 #include <tuple>
     36 
     37 //#define FETCH_DUMP_VERTEX 1
     38 using namespace llvm;
     39 using namespace SwrJit;
     40 
     41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
     42 
     43 enum ConversionType
     44 {
     45     CONVERT_NONE,
     46     CONVERT_NORMALIZED,
     47     CONVERT_USCALED,
     48     CONVERT_SSCALED,
     49     CONVERT_SFIXED,
     50 };
     51 
     52 //////////////////////////////////////////////////////////////////////////
     53 /// Interface to Jitting a fetch shader
     54 //////////////////////////////////////////////////////////////////////////
     55 struct FetchJit : public Builder
     56 {
     57     FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
     58 
     59     Function* Create(const FETCH_COMPILE_STATE& fetchState);
     60     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
     61     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
     62     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
     63 
     64     // package up Shuffle*bpcGatherd args into a tuple for convenience
     65     typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
     66         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
     67         const uint32_t(&)[4]> Shuffle8bpcArgs;
     68     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
     69 
     70     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
     71         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
     72     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
     73 
     74     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
     75 
     76     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
     77 
     78     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
     79     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
     80 
     81     bool IsOddFormat(SWR_FORMAT format);
     82     bool IsUniformFormat(SWR_FORMAT format);
     83     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
     84     void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
     85     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
     86 
     87     Value* mpFetchInfo;
     88 };
     89 
     90 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     91 {
     92     static std::size_t fetchNum = 0;
     93 
     94     std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
     95     fnName << fetchNum++;
     96 
     97     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
     98     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
     99 
    100     IRB()->SetInsertPoint(entry);
    101 
    102     auto    argitr = fetch->arg_begin();
    103 
    104     // Fetch shader arguments
    105     mpFetchInfo = &*argitr; ++argitr;
    106     mpFetchInfo->setName("fetchInfo");
    107     Value*    pVtxOut = &*argitr;
    108     pVtxOut->setName("vtxOutput");
     109     // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
     110     // index 0 (just the pointer to the simdvertex structure)
     111     // index 1 (which element of the simdvertex structure to offset to; in this case 0)
     112     // so the indices being i32's doesn't matter
     113     // TODO: generate this GEP with a VECTOR structure type so this makes sense
    114     std::vector<Value*>    vtxInputIndices(2, C(0));
     115     // GEP to the first element and reinterpret the result as a pointer to simd-width float vectors
    116     pVtxOut = GEP(pVtxOut, C(0));
    117     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
    118 
    119     // SWR_FETCH_CONTEXT::pStreams
    120     Value*    streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
    121     streams->setName("pStreams");
    122 
    123     // SWR_FETCH_CONTEXT::pIndices
    124     Value*    indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
    125     indices->setName("pIndices");
    126 
    127     // SWR_FETCH_CONTEXT::pLastIndex
    128     Value*    pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
    129     pLastIndex->setName("pLastIndex");
    130 
    131 
    132     Value* vIndices;
    133     switch(fetchState.indexType)
    134     {
    135         case R8_UINT:
    136             indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
    137             if(fetchState.bDisableIndexOOBCheck){
    138                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
    139                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
    140             }
    141             else{
    142                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
    143                 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
    144             }
    145             break;
    146         case R16_UINT:
    147             indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
    148             if(fetchState.bDisableIndexOOBCheck){
    149                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
    150                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
    151             }
    152             else{
    153                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
    154                 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
    155             }
    156             break;
    157         case R32_UINT:
    158             (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
    159                                                : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
    160             break; // incoming type is already 32bit int
    161         default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
    162     }
    163 
    164     Value* vVertexId = vIndices;
    165     if (fetchState.bVertexIDOffsetEnable)
    166     {
    167         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
    168         Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
    169         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
    170         vVertexId = ADD(vIndices, vBaseVertex);
    171         vVertexId = ADD(vVertexId, vStartVertex);
    172     }
    173 
    174     // store out vertex IDs
    175     STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
    176 
    177     // store out cut mask if enabled
    178     if (fetchState.bEnableCutIndex)
    179     {
    180         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
    181         Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
    182         STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
    183     }
    184 
    185     // Fetch attributes from memory and output to a simdvertex struct
    186     // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
    187     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
    188                                  : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
    189 
    190     RET_VOID();
    191 
    192     JitManager::DumpToFile(fetch, "src");
    193 
    194 #if defined(_DEBUG)
    195     verifyFunction(*fetch);
    196 #endif
    197 
    198     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
    199 
    200     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    201     setupPasses.add(createBreakCriticalEdgesPass());
    202     setupPasses.add(createCFGSimplificationPass());
    203     setupPasses.add(createEarlyCSEPass());
    204     setupPasses.add(createPromoteMemoryToRegisterPass());
    205 
    206     setupPasses.run(*fetch);
    207 
    208     JitManager::DumpToFile(fetch, "se");
    209 
    210     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
    211 
    212     ///@todo Haven't touched these either. Need to remove some of these and add others.
    213     optPasses.add(createCFGSimplificationPass());
    214     optPasses.add(createEarlyCSEPass());
    215     optPasses.add(createInstructionCombiningPass());
    216     optPasses.add(createInstructionSimplifierPass());
    217     optPasses.add(createConstantPropagationPass());
    218     optPasses.add(createSCCPPass());
    219     optPasses.add(createAggressiveDCEPass());
    220 
    221     optPasses.run(*fetch);
    222     optPasses.run(*fetch);
    223 
    224     JitManager::DumpToFile(fetch, "opt");
    225 
    226     return fetch;
    227 }
    228 
    229 //////////////////////////////////////////////////////////////////////////
    230 /// @brief Loads attributes from memory using LOADs, shuffling the
    231 /// components into SOA form.
    232 /// *Note* currently does not support component control,
     233 /// component packing, or instancing
    234 /// @param fetchState - info about attributes to be fetched from memory
    235 /// @param streams - value pointer to the current vertex stream
    236 /// @param vIndices - vector value of indices to load
    237 /// @param pVtxOut - value pointer to output simdvertex struct
    238 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
    239 {
    240     // Zack shuffles; a variant of the Charleston.
    241 
    242     std::vector<Value*> vectors(16);
    243     std::vector<Constant*>    pMask(mVWidth);
    244     for(uint32_t i = 0; i < mVWidth; ++i)
    245     {
    246         pMask[i] = (C(i < 4 ? i : 4));
    247     }
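             // promote mask selects the 4 loaded components into the low lanes; lanes >= 4 pull an
             // undef element from uwvec when used with VSHUFFLE below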
    248     Constant* promoteMask = ConstantVector::get(pMask);
    249     Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
    250 
    251     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    252     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    253     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    254     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    255     curInstance->setName("curInstance");
    256 
    257     for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
    258     {
    259         Value*    elements[4] = {0};
    260         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
    261         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
    262         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
    263         uint32_t    numComponents = info.numComps;
    264         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
    265 
    266         // load path doesn't support component packing
    267         SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
    268 
    269         vectors.clear();
    270 
    271         Value *vCurIndices;
    272         Value *startOffset;
    273         if(ied.InstanceEnable)
    274         {
    275             Value* stepRate = C(ied.InstanceDataStepRate);
    276 
    277             // prevent a div by 0 for 0 step rate
    278             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
    279             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
    280 
    281             // calc the current offset into instanced data buffer
    282             Value* calcInstance = UDIV(curInstance, stepRate);
    283 
    284             // if step rate is 0, every instance gets instance 0
    285             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
    286 
    287             vCurIndices = VBROADCAST(calcInstance);
    288 
    289             startOffset = startInstance;
    290         }
    291         else
    292         {
    293             // offset indices by baseVertex
    294             vCurIndices = ADD(vIndices, vBaseVertex);
    295 
    296             startOffset = startVertex;
    297         }
    298 
    299         // load SWR_VERTEX_BUFFER_STATE::pData
    300         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
    301 
    302         // load SWR_VERTEX_BUFFER_STATE::pitch
    303         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
    304         stride = Z_EXT(stride, mInt64Ty);
    305 
    306         // load SWR_VERTEX_BUFFER_STATE::size
    307         Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
    308         size = Z_EXT(size, mInt64Ty);
    309 
    310         Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
    311 
    312         // Load from the stream.
    313         for(uint32_t lane = 0; lane < mVWidth; ++lane)
    314         {
    315             // Get index
    316             Value* index = VEXTRACT(vCurIndices, C(lane));
    317             index = Z_EXT(index, mInt64Ty);
    318 
    319             Value*    offset = MUL(index, stride);
    320             offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
    321             offset = ADD(offset, startVertexOffset);
    322 
    323             if (!fetchState.bDisableIndexOOBCheck) {
    324                 // check for out of bound access, including partial OOB, and mask them to 0
    325                 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
    326                 Value *oob = ICMP_ULE(endOffset, size);
    327                 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
    328             }
    329 
    330             Value*    pointer = GEP(stream, offset);
    331             // We use a full-lane, but don't actually care.
    332             Value*    vptr = 0;
    333 
    334             // get a pointer to a 4 component attrib in default address space
    335             switch(bpc)
    336             {
    337                 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
    338                 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
    339                 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
    340                 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
    341             }
    342 
    343             // load 4 components of attribute
    344             Value*    vec = ALIGNED_LOAD(vptr, 1, false);
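                     // alignment of 1 since attribute data in the vertex buffer has no guaranteed alignment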
    345 
    346             // Convert To FP32 internally
    347             switch(info.type[0])
    348             {
    349                 case SWR_TYPE_UNORM:
    350                     switch(bpc)
    351                     {
    352                         case 8:
    353                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    354                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
    355                             break;
    356                         case 16:
    357                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    358                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
    359                             break;
    360                         default:
    361                             SWR_ASSERT(false, "Unsupported underlying type!");
    362                             break;
    363                     }
    364                     break;
    365                 case SWR_TYPE_SNORM:
    366                     switch(bpc)
    367                     {
    368                         case 8:
    369                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    370                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
    371                             break;
    372                         case 16:
    373                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    374                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
    375                             break;
    376                         default:
    377                             SWR_ASSERT(false, "Unsupported underlying type!");
    378                             break;
    379                     }
    380                     break;
    381                 case SWR_TYPE_UINT:
     382                     // Zero extend 8 and 16 bit UINT values to 32 bits.
    383                     switch(bpc)
    384                     {
    385                         case 8:
    386                         case 16:
    387                             vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
    388                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
    389                             break;
    390                         case 32:
    391                             break; // Pass through unchanged.
    392                         default:
    393                             SWR_ASSERT(false, "Unsupported underlying type!");
    394                             break;
    395                     }
    396                     break;
    397                 case SWR_TYPE_SINT:
    398                     // Sign extend SINT types.
    399                     switch(bpc)
    400                     {
    401                         case 8:
    402                         case 16:
    403                             vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
    404                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
    405                             break;
    406                         case 32:
    407                             break; // Pass through unchanged.
    408                         default:
    409                             SWR_ASSERT(false, "Unsupported underlying type!");
    410                             break;
    411                     }
    412                     break;
    413                 case SWR_TYPE_FLOAT:
    414                     switch(bpc)
    415                     {
    416                         case 32:
    417                             break; // Pass through unchanged.
    418                         default:
    419                             SWR_ASSERT(false, "Unsupported underlying type!");
    420                     }
    421                     break;
    422                 case SWR_TYPE_USCALED:
    423                     vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    424                     break;
    425                 case SWR_TYPE_SSCALED:
    426                     vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
    427                     break;
    428                 case SWR_TYPE_SFIXED:
    429                     vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
    430                     break;
    431                 case SWR_TYPE_UNKNOWN:
    432                 case SWR_TYPE_UNUSED:
    433                     SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
    434             }
    435 
    436             // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
    437             // uwvec: 4 x F32, undef value
    438             Value*    wvec = VSHUFFLE(vec, uwvec, promoteMask);
    439             vectors.push_back(wvec);
    440         }
    441 
    442         std::vector<Constant*>        v01Mask(mVWidth);
    443         std::vector<Constant*>        v23Mask(mVWidth);
    444         std::vector<Constant*>        v02Mask(mVWidth);
    445         std::vector<Constant*>        v13Mask(mVWidth);
    446 
    447         // Concatenate the vectors together.
    448         elements[0] = VUNDEF_F();
    449         elements[1] = VUNDEF_F();
    450         elements[2] = VUNDEF_F();
    451         elements[3] = VUNDEF_F();
    452         for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
    453         {
    454             v01Mask[4 * b + 0] = C(0 + 4 * b);
    455             v01Mask[4 * b + 1] = C(1 + 4 * b);
    456             v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
    457             v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
    458 
    459             v23Mask[4 * b + 0] = C(2 + 4 * b);
    460             v23Mask[4 * b + 1] = C(3 + 4 * b);
    461             v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
    462             v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
    463 
    464             v02Mask[4 * b + 0] = C(0 + 4 * b);
    465             v02Mask[4 * b + 1] = C(2 + 4 * b);
    466             v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
    467             v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
    468 
    469             v13Mask[4 * b + 0] = C(1 + 4 * b);
    470             v13Mask[4 * b + 1] = C(3 + 4 * b);
    471             v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
    472             v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
    473 
    474             std::vector<Constant*>    iMask(mVWidth);
    475             for(uint32_t i = 0; i < mVWidth; ++i)
    476             {
    477                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
    478                 {
    479                     iMask[i] = C(i % 4 + mVWidth);
    480                 }
    481                 else
    482                 {
    483                     iMask[i] = C(i);
    484                 }
    485             }
    486             Constant* insertMask = ConstantVector::get(iMask);
    487             elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
    488             elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
    489             elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
    490             elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
    491         }
    492 
    493         Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
    494         Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
    495         Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
    496         Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
    497         elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
    498         elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
    499         elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
    500         elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
    501 
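             // Fill in defaults for components not present in the format. The cases below intentionally
             // fall through: missing x/y/z components default to 0.0f and a missing w defaults to 1.0f.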
    502         switch(numComponents + 1)
    503         {
    504             case    1: elements[0] = VIMMED1(0.0f);
    505             case    2: elements[1] = VIMMED1(0.0f);
    506             case    3: elements[2] = VIMMED1(0.0f);
    507             case    4: elements[3] = VIMMED1(1.0f);
    508         }
    509 
    510         for(uint32_t c = 0; c < 4; ++c)
    511         {
    512             Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
    513             STORE(elements[c], dest);
    514         }
    515     }
    516 }
    517 
     518 // returns true for odd formats that require special gather handling
    519 bool FetchJit::IsOddFormat(SWR_FORMAT format)
    520 {
    521     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    522     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    523     {
    524         return true;
    525     }
    526     return false;
    527 }
    528 
    529 // format is uniform if all components are the same size and type
    530 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
    531 {
    532     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    533     uint32_t bpc0 = info.bpc[0];
    534     uint32_t type0 = info.type[0];
    535 
    536     for (uint32_t c = 1; c < info.numComps; ++c)
    537     {
    538         if (bpc0 != info.bpc[c] || type0 != info.type[c])
    539         {
    540             return false;
    541         }
    542     }
    543     return true;
    544 }
    545 
    546 // unpacks components based on format
    547 // foreach component in the pixel
    548 //   mask off everything but this component
    549 //   shift component to LSB
    550 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
    551 {
    552     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    553 
    554     uint32_t bitOffset = 0;
    555     for (uint32_t c = 0; c < info.numComps; ++c)
    556     {
    557         uint32_t swizzledIndex = info.swizzle[c];
    558         uint32_t compBits = info.bpc[c];
    559         uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
    560         Value* comp = AND(vInput, bitmask);
    561         comp = LSHR(comp, bitOffset);
    562 
    563         result[swizzledIndex] = comp;
    564         bitOffset += compBits;
    565     }
    566 }
    567 
    568 // gather for odd component size formats
     569 // gather SIMD full pixels per lane then shift/mask to move each component into its
     570 // own vector
    571 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4])
    572 {
    573     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    574 
    575     // only works if pixel size is <= 32bits
    576     SWR_ASSERT(info.bpp <= 32);
    577 
    578     Value* gather = VUNDEF_I();
    579 
    580     // assign defaults
    581     for (uint32_t comp = 0; comp < 4; ++comp)
    582     {
    583         result[comp] = VIMMED1((int)info.defaults[comp]);
    584     }
    585 
    586     // load the proper amount of data based on component size
    587     PointerType* pLoadTy = nullptr;
    588     switch (info.bpp)
    589     {
    590     case 8: pLoadTy = Type::getInt8PtrTy(JM()->mContext); break;
    591     case 16: pLoadTy = Type::getInt16PtrTy(JM()->mContext); break;
    592     case 24:
    593     case 32: pLoadTy = Type::getInt32PtrTy(JM()->mContext); break;
    594     default: SWR_ASSERT(0);
    595     }
    596 
    597     // allocate temporary memory for masked off lanes
    598     Value* pTmp = ALLOCA(pLoadTy->getElementType());
    599 
    600     // gather SIMD pixels
    601     for (uint32_t e = 0; e < JM()->mVWidth; ++e)
    602     {
    603         Value* pElemOffset = VEXTRACT(offsets, C(e));
    604         Value* pLoad = GEP(pBase, pElemOffset);
    605         Value* pLaneMask = VEXTRACT(pMask, C(e));
    606 
    607         pLoad = POINTER_CAST(pLoad, pLoadTy);
    608 
    609         // mask in tmp pointer for disabled lanes
    610         pLoad = SELECT(pLaneMask, pLoad, pTmp);
    611 
    612         // load pixel
    613         Value *val = LOAD(pLoad);
    614 
    615         // zero extend to 32bit integer
    616         val = INT_CAST(val, mInt32Ty, false);
    617 
    618         // store in simd lane
    619         gather = VINSERT(gather, val, C(e));
    620     }
    621 
    622     UnpackComponents(format, gather, result);
    623 
    624     // cast to fp32
    625     result[0] = BITCAST(result[0], mSimdFP32Ty);
    626     result[1] = BITCAST(result[1], mSimdFP32Ty);
    627     result[2] = BITCAST(result[2], mSimdFP32Ty);
    628     result[3] = BITCAST(result[3], mSimdFP32Ty);
    629 }
    630 
    631 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
    632 {
    633     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    634 
    635     for (uint32_t c = 0; c < info.numComps; ++c)
    636     {
    637         uint32_t compIndex = info.swizzle[c];
    638 
    639         // skip any conversion on UNUSED components
    640         if (info.type[c] == SWR_TYPE_UNUSED)
    641         {
    642             continue;
    643         }
    644 
    645         if (info.isNormalized[c])
    646         {
    647             if (info.type[c] == SWR_TYPE_SNORM)
    648             {
    649                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
    650 
    651                 /// result = c * (1.0f / (2^(n-1) - 1);
    652                 uint32_t n = info.bpc[c];
    653                 uint32_t pow2 = 1 << (n - 1);
    654                 float scale = 1.0f / (float)(pow2 - 1);
    655                 Value *vScale = VIMMED1(scale);
    656                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    657                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
    658                 texels[compIndex] = FMUL(texels[compIndex], vScale);
    659             }
    660             else
    661             {
    662                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
    663 
    664                 /// result = c * (1.0f / (2^n - 1))
    665                 uint32_t n = info.bpc[c];
    666                 uint32_t pow2 = 1 << n;
    667                 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
    668                 if (n == 24)
    669                 {
    670                     float scale = (float)(pow2 - 1);
    671                     Value* vScale = VIMMED1(scale);
    672                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    673                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
    674                     texels[compIndex] = FDIV(texels[compIndex], vScale);
    675                 }
    676                 else
    677                 {
    678                     float scale = 1.0f / (float)(pow2 - 1);
    679                     Value *vScale = VIMMED1(scale);
    680                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
    681                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
    682                     texels[compIndex] = FMUL(texels[compIndex], vScale);
    683                 }
    684             }
    685             continue;
    686         }
    687     }
    688 }
    689 
    690 //////////////////////////////////////////////////////////////////////////
    691 /// @brief Loads attributes from memory using AVX2 GATHER(s)
    692 /// @param fetchState - info about attributes to be fetched from memory
    693 /// @param streams - value pointer to the current vertex stream
    694 /// @param vIndices - vector value of indices to gather
    695 /// @param pVtxOut - value pointer to output simdvertex struct
    696 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
    697                                  Value* streams, Value* vIndices, Value* pVtxOut)
    698 {
    699     uint32_t currentVertexElement = 0;
    700     uint32_t outputElt = 0;
    701     Value* vVertexElements[4];
    702 
    703     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    704     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    705     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    706     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    707     curInstance->setName("curInstance");
    708 
    709     for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
    710     {
    711         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
    712 
    713         // skip element if all components are disabled
    714         if (ied.ComponentPacking == ComponentEnable::NONE)
    715         {
    716             continue;
    717         }
    718 
    719         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
    720         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
    721         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
    722 
    723         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
    724 
    725         // VGATHER* takes an *i8 src pointer
    726         Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
    727 
    728         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
    729         Value *vStride = VBROADCAST(stride);
    730 
    731         // max vertex index that is fully in bounds
    732         Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
    733         maxVertex = LOAD(maxVertex);
    734 
    735         Value *vCurIndices;
    736         Value *startOffset;
    737         if(ied.InstanceEnable)
    738         {
    739             Value* stepRate = C(ied.InstanceDataStepRate);
    740 
    741             // prevent a div by 0 for 0 step rate
    742             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
    743             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
    744 
    745             // calc the current offset into instanced data buffer
    746             Value* calcInstance = UDIV(curInstance, stepRate);
    747 
    748             // if step rate is 0, every instance gets instance 0
    749             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
    750 
    751             vCurIndices = VBROADCAST(calcInstance);
    752 
    753             startOffset = startInstance;
    754         }
    755         else
    756         {
    757             // offset indices by baseVertex
    758             vCurIndices = ADD(vIndices, vBaseVertex);
    759 
    760             startOffset = startVertex;
    761         }
    762 
    763         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
    764         // do 64bit address offset calculations.
    765 
    766         // calculate byte offset to the start of the VB
    767         Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
    768         pStreamBase = GEP(pStreamBase, baseOffset);
    769 
    770         // if we have a start offset, subtract from max vertex. Used for OOB check
    771         maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
    772         Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
    773         // if we have a negative value, we're already OOB. clamp at 0.
    774         maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
    775 
    776         // Load the in bounds size of a partially valid vertex
    777         Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
    778         partialInboundsSize = LOAD(partialInboundsSize);
    779         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
    780         Value* vBpp = VBROADCAST(C(info.Bpp));
    781         Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
    782 
     783         // is the element <= the partially valid size?
    784         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
    785 
    786         // override cur indices with 0 if pitch is 0
    787         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
    788         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
    789 
    790         // are vertices partially OOB?
    791         Value* vMaxVertex = VBROADCAST(maxVertex);
    792         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
    793 
     794         // are the vertices fully in bounds?
    795         Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
    796 
    797         // blend in any partially OOB indices that have valid elements
    798         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
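             // keep the i1 compare mask (pMask) for the odd-format and 64bit paths; widen it to a full
             // SIMD mask (vGatherMask) for the gather intrinsics below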
    799         Value* pMask = vGatherMask;
    800         vGatherMask = VMASK(vGatherMask);
    801 
    802         // calculate the actual offsets into the VB
    803         Value* vOffsets = MUL(vCurIndices, vStride);
    804         vOffsets = ADD(vOffsets, vAlignmentOffsets);
    805 
    806         // Packing and component control
    807         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
    808         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
    809                                              (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
    810 
    811         // Special gather/conversion for formats without equal component sizes
    812         if (IsOddFormat((SWR_FORMAT)ied.Format))
    813         {
    814             Value* pResults[4];
    815             CreateGatherOddFormats((SWR_FORMAT)ied.Format, pMask, pStreamBase, vOffsets, pResults);
    816             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
    817 
    818             for (uint32_t c = 0; c < 4; ++c)
    819             {
    820                 if (isComponentEnabled(compMask, c))
    821                 {
    822                     vVertexElements[currentVertexElement++] = pResults[c];
    823                     if (currentVertexElement > 3)
    824                     {
    825                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
    826                         // reset to the next vVertexElement to output
    827                         currentVertexElement = 0;
    828                     }
    829                 }
    830             }
    831         }
    832         else if(info.type[0] == SWR_TYPE_FLOAT)
    833         {
    834             ///@todo: support 64 bit vb accesses
    835             Value* gatherSrc = VIMMED1(0.0f);
    836 
    837             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
    838                 "Unsupported format for standard gather fetch.");
    839 
    840             // Gather components from memory to store in a simdvertex structure
    841             switch(bpc)
    842             {
    843                 case 16:
    844                 {
    845                     Value* vGatherResult[2];
    846                     Value *vMask;
    847 
    848                     // if we have at least one component out of x or y to fetch
    849                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
    850                         // save mask as it is zero'd out after each gather
    851                         vMask = vGatherMask;
    852 
    853                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
    854                         // e.g. result of first 8x32bit integer gather for 16bit components
    855                         // 256i - 0    1    2    3    4    5    6    7
    856                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
    857                         //
    858                     }
    859 
    860                     // if we have at least one component out of z or w to fetch
    861                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
    862                         // offset base to the next components(zw) in the vertex to gather
    863                         pStreamBase = GEP(pStreamBase, C((char)4));
    864                         vMask = vGatherMask;
    865 
    866                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
    867                         // e.g. result of second 8x32bit integer gather for 16bit components
    868                         // 256i - 0    1    2    3    4    5    6    7
    869                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
    870                         //
    871                     }
    872 
    873                     // if we have at least one component to shuffle into place
    874                     if(compMask){
    875                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
    876                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
    877 
    878                         // Shuffle gathered components into place in simdvertex struct
    879                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
    880                     }
    881                 }
    882                     break;
    883                 case 32:
    884                 {
    885                     for (uint32_t i = 0; i < 4; i++)
    886                     {
    887                         if (isComponentEnabled(compMask, i))
    888                         {
    889                             // if we need to gather the component
    890                             if (compCtrl[i] == StoreSrc)
    891                             {
    892                                 // save mask as it is zero'd out after each gather
    893                                 Value *vMask = vGatherMask;
    894 
    895                                 // Gather a SIMD of vertices
    896                                 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
    897                             }
    898                             else
    899                             {
    900                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
    901                             }
    902 
    903                             if (currentVertexElement > 3)
    904                             {
    905                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
    906                                 // reset to the next vVertexElement to output
    907                                 currentVertexElement = 0;
    908                             }
    909 
    910                         }
    911 
    912                         // offset base to the next component in the vertex to gather
    913                         pStreamBase = GEP(pStreamBase, C((char)4));
    914                     }
    915                 }
    916                     break;
    917                 case 64:
    918                 {
    919                     for (uint32_t i = 0; i < 4; i++)
    920                     {
    921                         if (isComponentEnabled(compMask, i))
    922                         {
    923                             // if we need to gather the component
    924                             if (compCtrl[i] == StoreSrc)
    925                             {
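                                         // gather 64bit doubles as two 4-wide halves: split the mask and offsets,
                                         // gather each half with GATHERPD, convert the doubles down to floats,
                                         // then recombine into a single SIMD register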
    926                                 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
    927                                 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
    928                                 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
    929                                 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
    930                                 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
    931                                 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
    932 
    933                                 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
    934                                 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
    935 
    936                                 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
    937 
    938                                 Value* pGatherLo = GATHERPD(vZeroDouble,
    939                                                             pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
    940                                 Value* pGatherHi = GATHERPD(vZeroDouble,
    941                                                             pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
    942 
    943                                 pGatherLo = VCVTPD2PS(pGatherLo);
    944                                 pGatherHi = VCVTPD2PS(pGatherHi);
    945 
    946                                 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
    947 
    948                                 vVertexElements[currentVertexElement++] = pGather;
    949                             }
    950                             else
    951                             {
    952                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
    953                             }
    954 
    955                             if (currentVertexElement > 3)
    956                             {
    957                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
    958                                 // reset to the next vVertexElement to output
    959                                 currentVertexElement = 0;
    960                             }
    961 
    962                         }
    963 
    964                         // offset base to the next component  in the vertex to gather
    965                         pStreamBase = GEP(pStreamBase, C((char)8));
    966                     }
    967                 }
    968                     break;
    969                 default:
    970                     SWR_ASSERT(0, "Tried to fetch invalid FP format");
    971                     break;
    972             }
    973         }
    974         else
    975         {
    976             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
    977             ConversionType conversionType = CONVERT_NONE;
    978 
    979             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
    980                 "Unsupported format for standard gather fetch.");
    981 
    982             switch(info.type[0])
    983             {
    984                 case SWR_TYPE_UNORM:
    985                     conversionType = CONVERT_NORMALIZED;
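                             // fall through: UNORM also zero-extends like UINT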
    986                 case SWR_TYPE_UINT:
    987                     extendCastType = Instruction::CastOps::ZExt;
    988                     break;
    989                 case SWR_TYPE_SNORM:
    990                     conversionType = CONVERT_NORMALIZED;
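                             // fall through: SNORM also sign-extends like SINT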
    991                 case SWR_TYPE_SINT:
    992                     extendCastType = Instruction::CastOps::SExt;
    993                     break;
    994                 case SWR_TYPE_USCALED:
    995                     conversionType = CONVERT_USCALED;
    996                     extendCastType = Instruction::CastOps::UIToFP;
    997                     break;
    998                 case SWR_TYPE_SSCALED:
    999                     conversionType = CONVERT_SSCALED;
   1000                     extendCastType = Instruction::CastOps::SIToFP;
   1001                     break;
   1002                 case SWR_TYPE_SFIXED:
   1003                     conversionType = CONVERT_SFIXED;
   1004                     extendCastType = Instruction::CastOps::SExt;
   1005                     break;
   1006                 default:
   1007                     break;
   1008             }
   1009 
   1010             // value substituted when component of gather is masked
   1011             Value* gatherSrc = VIMMED1(0);
   1012 
   1013             // Gather components from memory to store in a simdvertex structure
   1014             switch (bpc)
   1015             {
   1016                 case 8:
   1017                 {
   1018                     // if we have at least one component to fetch
   1019                     if(compMask)
   1020                     {
   1021                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
   1022                         // e.g. result of an 8x32bit integer gather for 8bit components
   1023                         // 256i - 0    1    2    3    4    5    6    7
   1024                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
   1025 
   1026                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
   1027                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
   1028 
   1029                         // Shuffle gathered components into place in simdvertex struct
   1030                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
   1031                     }
   1032                 }
   1033                 break;
   1034                 case 16:
   1035                 {
   1036                     Value* vGatherResult[2];
   1037                     Value *vMask;
   1038 
   1039                     // if we have at least one component out of x or y to fetch
   1040                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
   1041                         // save mask as it is zero'd out after each gather
   1042                         vMask = vGatherMask;
   1043 
   1044                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
   1045                         // e.g. result of first 8x32bit integer gather for 16bit components
   1046                         // 256i - 0    1    2    3    4    5    6    7
   1047                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
   1048                         //
   1049                     }
   1050 
   1051                     // if we have at least one component out of z or w to fetch
   1052                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
   1053                         // offset base to the next components(zw) in the vertex to gather
   1054                         pStreamBase = GEP(pStreamBase, C((char)4));
   1055                         vMask = vGatherMask;
   1056 
   1057                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
   1058                         // e.g. result of second 8x32bit integer gather for 16bit components
   1059                         // 256i - 0    1    2    3    4    5    6    7
   1060                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
   1061                         //
   1062                     }
   1063 
   1064                     // if we have at least one component to shuffle into place
   1065                     if(compMask){
   1066                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
   1067                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
   1068 
   1069                         // Shuffle gathered components into place in simdvertex struct
   1070                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
   1071                     }
   1072                 }
   1073                 break;
   1074                 case 32:
   1075                 {
    1076                     // Gather components from memory to store in the simdvertex struct
   1077                     for (uint32_t i = 0; i < 4; i++)
   1078                     {
   1079                         if (isComponentEnabled(compMask, i))
   1080                         {
   1081                             // if we need to gather the component
   1082                             if (compCtrl[i] == StoreSrc)
   1083                             {
   1084                                 // save mask as it is zero'd out after each gather
   1085                                 Value *vMask = vGatherMask;
   1086 
   1087                                 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
   1088 
   1089                                 if (conversionType == CONVERT_USCALED)
   1090                                 {
   1091                                     pGather = UI_TO_FP(pGather, mSimdFP32Ty);
   1092                                 }
   1093                                 else if (conversionType == CONVERT_SSCALED)
   1094                                 {
   1095                                     pGather = SI_TO_FP(pGather, mSimdFP32Ty);
   1096                                 }
   1097                                 else if (conversionType == CONVERT_SFIXED)
   1098                                 {
   1099                                     pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
   1100                                 }
   1101 
   1102                                 vVertexElements[currentVertexElement++] = pGather;
   1103                                 // e.g. result of a single 8x32bit integer gather for 32bit components
   1104                                 // 256i - 0    1    2    3    4    5    6    7
   1105                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
   1106                             }
   1107                             else
   1108                             {
   1109                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1110                             }
   1111 
   1112                             if (currentVertexElement > 3)
   1113                             {
   1114                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1115                                 // reset to the next vVertexElement to output
   1116                                 currentVertexElement = 0;
   1117                             }
   1118 
   1119                         }
   1120 
    1121                         // offset base to the next component in the vertex to gather
   1122                         pStreamBase = GEP(pStreamBase, C((char)4));
   1123                     }
   1124                 }
   1125                 break;
   1126             }
   1127         }
   1128     }
   1129 
   1130     // if we have a partially filled vVertexElement struct, output it
   1131     if(currentVertexElement > 0){
   1132         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
   1133     }
   1134 }
   1135 
   1136 //////////////////////////////////////////////////////////////////////////
   1137 /// @brief Loads a simd of valid indices. OOB indices are set to 0
    1138 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
   1139 /// support
   1140 /// @param pIndices - pointer to 8 bit indices
   1141 /// @param pLastIndex - pointer to last valid index
   1142 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
   1143 {
    1144     // can fit 4 8 bit integers per vWidth lane
   1145     Value* vIndices =  VUNDEF_I();
   1146 
   1147     // store 0 index on stack to be used to conditionally load from if index address is OOB
   1148     Value* pZeroIndex = ALLOCA(mInt8Ty);
   1149     STORE(C((uint8_t)0), pZeroIndex);
   1150 
   1151     // Load a SIMD of index pointers
   1152     for(int64_t lane = 0; lane < mVWidth; lane++)
   1153     {
   1154         // Calculate the address of the requested index
   1155         Value *pIndex = GEP(pIndices, C(lane));
   1156 
   1157         // check if the address is less than the max index,
    1158         // check if the index address is less than the last valid index address
   1159 
   1160         // if valid, load the index. if not, load 0 from the stack
   1161         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
   1162         Value *index = LOAD(pValid, "valid index");
   1163 
    1164         // zero extend index to 32 bits and insert into the correct simd lane
   1165         index = Z_EXT(index, mInt32Ty);
   1166         vIndices = VINSERT(vIndices, index, lane);
   1167     }
   1168     return vIndices;
   1169 }
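         // Example (assuming mVWidth == 8 and pLastIndex == pIndices + 5): lanes 0..4 pass the
         // ICMP_ULT check and load their real 8-bit indices, while lanes 5..7 are redirected to
         // the stack zero and therefore fetch index 0.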
   1170 
   1171 //////////////////////////////////////////////////////////////////////////
   1172 /// @brief Loads a simd of valid indices. OOB indices are set to 0
   1173 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
   1174 /// support
   1175 /// @param pIndices - pointer to 16 bit indices
   1176 /// @param pLastIndex - pointer to last valid index
   1177 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
   1178 {
   1179     // can fit 2 16 bit integers per vWidth lane
   1180     Value* vIndices =  VUNDEF_I();
   1181 
   1182     // store 0 index on stack to be used to conditionally load from if index address is OOB
   1183     Value* pZeroIndex = ALLOCA(mInt16Ty);
   1184     STORE(C((uint16_t)0), pZeroIndex);
   1185 
   1186     // Load a SIMD of index pointers
   1187     for(int64_t lane = 0; lane < mVWidth; lane++)
   1188     {
   1189         // Calculate the address of the requested index
   1190         Value *pIndex = GEP(pIndices, C(lane));
   1191 
   1192         // check if the address is less than the max index,
    1193         // check if the index address is less than the last valid index address
   1194 
   1195         // if valid, load the index. if not, load 0 from the stack
   1196         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
   1197         Value *index = LOAD(pValid, "valid index");
   1198 
    1199         // zero extend index to 32 bits and insert into the correct simd lane
   1200         index = Z_EXT(index, mInt32Ty);
   1201         vIndices = VINSERT(vIndices, index, lane);
   1202     }
   1203     return vIndices;
   1204 }
   1205 
   1206 //////////////////////////////////////////////////////////////////////////
   1207 /// @brief Loads a simd of valid indices. OOB indices are set to 0
   1208 /// @param pIndices - pointer to 32 bit indices
   1209 /// @param pLastIndex - pointer to last valid index
   1210 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
   1211 {
   1212     DataLayout dL(JM()->mpCurrentModule);
   1213     unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
   1214     Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
   1215     Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
   1216 
   1217     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
   1218     Value* numIndicesLeft = SUB(iLastIndex,iIndices);
   1219     numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
   1220     numIndicesLeft = SDIV(numIndicesLeft, C(4));
   1221 
   1222     // create a vector of index counts from the base index ptr passed into the fetch
   1223     const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
   1224     Constant* vIndexOffsets = ConstantVector::get(vecIndices);
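             // NOTE: the offsets are hardcoded for an 8-wide SIMD (mVWidth == 8); a different
             // vector width would need a matching offset list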
   1225 
   1226     // compare index count to the max valid index
    1227     // e.g. vMaxIndex     4 4 4 4 4 4 4 4 : 4 indices left to load
   1228     //     vIndexOffsets  0 1 2 3 4 5 6 7
   1229     //     ------------------------------
   1230     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
   1231     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
   1232     Value* vMaxIndex = VBROADCAST(numIndicesLeft);
   1233     Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
   1234 
    1235     // VMASKLOAD takes an i8* src pointer
   1236     pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
   1237 
   1238     // Load the indices; OOB loads 0
   1239     return MASKLOADD(pIndices,vIndexMask);
   1240 }
   1241 
   1242 //////////////////////////////////////////////////////////////////////////
   1243 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
   1244 /// denormalizes if needed, converts to F32 if needed, and positions in
    1245 /// the proper SIMD rows to be output to the simdvertex structure
   1246 /// @param args: (tuple of args, listed below)
   1247 ///   @param vGatherResult - 8 gathered 8bpc vertices
   1248 ///   @param pVtxOut - base pointer to output simdvertex struct
   1249 ///   @param extendType - sign extend or zero extend
    1250 ///   @param conversionType - conversion to apply (none, normalized, scaled, or sfixed)
    1251 ///   @param currentVertexElement - reference to the current vVertexElement
    1252 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
   1253 ///   @param compMask - component packing mask
   1254 ///   @param compCtrl - component control val
   1255 ///   @param vVertexElements[4] - vertex components to output
   1256 ///   @param swizzle[4] - component swizzle location
   1257 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
   1258 {
   1259     // Unpack tuple args
   1260     Value*& vGatherResult = std::get<0>(args);
   1261     Value* pVtxOut = std::get<1>(args);
   1262     const Instruction::CastOps extendType = std::get<2>(args);
   1263     const ConversionType conversionType = std::get<3>(args);
   1264     uint32_t &currentVertexElement = std::get<4>(args);
   1265     uint32_t &outputElt =  std::get<5>(args);
   1266     const ComponentEnable compMask = std::get<6>(args);
   1267     const ComponentControl (&compCtrl)[4] = std::get<7>(args);
   1268     Value* (&vVertexElements)[4] = std::get<8>(args);
   1269     const uint32_t (&swizzle)[4] = std::get<9>(args);
   1270 
   1271     // cast types
   1272     Type* vGatherTy = mSimdInt32Ty;
   1273     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
   1274 
   1275     // have to do extra work for sign extending
   1276     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
   1277         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
   1278         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1279 
   1280         // shuffle mask, including any swizzling
   1281         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
   1282         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
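                 // each 16-byte half of the mask picks byte swizzle[c] out of the four gathered
                 // 32-bit elements (offsets c, c+4, c+8, c+12), grouping all x bytes, then y, z, w
                 // within each 128-bit lane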
   1283         Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
   1284                     char(y), char(y+4), char(y+8), char(y+12),
   1285                     char(z), char(z+4), char(z+8), char(z+12),
   1286                     char(w), char(w+4), char(w+8), char(w+12),
   1287                     char(x), char(x+4), char(x+8), char(x+12),
   1288                     char(y), char(y+4), char(y+8), char(y+12),
   1289                     char(z), char(z+4), char(z+8), char(z+12),
   1290                     char(w), char(w+4), char(w+8), char(w+12)});
   1291 
   1292         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
   1293         // after pshufb: group components together in each 128bit lane
   1294         // 256i - 0    1    2    3    4    5    6    7
   1295         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
   1296 
   1297         Value* vi128XY = nullptr;
   1298         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
   1299             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
   1300             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
   1301             // 256i - 0    1    2    3    4    5    6    7
   1302             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
   1303         }
   1304 
   1305         // do the same for zw components
   1306         Value* vi128ZW = nullptr;
   1307         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
   1308             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
   1309         }
   1310 
   1311         // init denormalize variables if needed
   1312         Instruction::CastOps fpCast;
   1313         Value* conversionFactor;
   1314 
   1315         switch (conversionType)
   1316         {
   1317         case CONVERT_NORMALIZED:
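                     // SNORM8: scale the sign-extended byte range [-127, 127] back to [-1.0, 1.0]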
   1318             fpCast = Instruction::CastOps::SIToFP;
   1319             conversionFactor = VIMMED1((float)(1.0 / 127.0));
   1320             break;
   1321         case CONVERT_SSCALED:
   1322             fpCast = Instruction::CastOps::SIToFP;
   1323             conversionFactor = VIMMED1((float)(1.0));
   1324             break;
   1325         case CONVERT_USCALED:
   1326             SWR_ASSERT(0, "Type should not be sign extended!");
   1327             conversionFactor = nullptr;
   1328             break;
   1329         default:
   1330             SWR_ASSERT(conversionType == CONVERT_NONE);
   1331             conversionFactor = nullptr;
   1332             break;
   1333         }
   1334 
    1335         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
   1336         for (uint32_t i = 0; i < 4; i++)
   1337         {
   1338             if (isComponentEnabled(compMask, i))
   1339             {
   1340                 if (compCtrl[i] == ComponentControl::StoreSrc)
   1341                 {
   1342                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1343                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1344                     // if x or y, use vi128XY permute result, else use vi128ZW
   1345                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1346 
   1347                     // sign extend
   1348                     vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
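                             // after pmovsxbd: one sign-extended component per 32-bit lane (shown for x)
                             // 256i - 0    1    2    3    4    5    6    7
                             //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx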
   1349 
   1350                     // denormalize if needed
   1351                     if (conversionType != CONVERT_NONE)
   1352                     {
   1353                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   1354                     }
   1355                     currentVertexElement++;
   1356                 }
   1357                 else
   1358                 {
   1359                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1360                 }
   1361 
   1362                 if (currentVertexElement > 3)
   1363                 {
   1364                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1365                     // reset to the next vVertexElement to output
   1366                     currentVertexElement = 0;
   1367                 }
   1368             }
   1369         }
   1370     }
   1371     // else zero extend
   1372     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
   1373     {
   1374         // init denormalize variables if needed
   1375         Instruction::CastOps fpCast;
   1376         Value* conversionFactor;
   1377 
   1378         switch (conversionType)
   1379         {
   1380         case CONVERT_NORMALIZED:
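                     // UNORM8: scale the unsigned byte range [0, 255] to [0.0, 1.0]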
   1381             fpCast = Instruction::CastOps::UIToFP;
   1382             conversionFactor = VIMMED1((float)(1.0 / 255.0));
   1383             break;
   1384         case CONVERT_USCALED:
   1385             fpCast = Instruction::CastOps::UIToFP;
   1386             conversionFactor = VIMMED1((float)(1.0));
   1387             break;
   1388         case CONVERT_SSCALED:
   1389             SWR_ASSERT(0, "Type should not be zero extended!");
   1390             conversionFactor = nullptr;
   1391             break;
   1392         default:
   1393             SWR_ASSERT(conversionType == CONVERT_NONE);
   1394             conversionFactor = nullptr;
   1395             break;
   1396         }
   1397 
   1398         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
   1399         for (uint32_t i = 0; i < 4; i++)
   1400         {
   1401             if (isComponentEnabled(compMask, i))
   1402             {
   1403                 if (compCtrl[i] == ComponentControl::StoreSrc)
   1404                 {
   1405                     // pshufb masks for each component
   1406                     Value* vConstMask;
   1407                     switch (swizzle[i])
   1408                     {
   1409                     case 0:
   1410                         // x shuffle mask
   1411                         vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
   1412                                                0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
   1413                         break;
   1414                     case 1:
   1415                         // y shuffle mask
   1416                         vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
   1417                                                1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
   1418                         break;
   1419                     case 2:
   1420                         // z shuffle mask
   1421                         vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
   1422                                                2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
   1423                         break;
   1424                     case 3:
   1425                         // w shuffle mask
   1426                         vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
   1427                                                3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
   1428                         break;
   1429                     default:
   1430                         vConstMask = nullptr;
   1431                         break;
   1432                     }
   1433 
   1434                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
   1435                     // after pshufb for x channel
   1436                     // 256i - 0    1    2    3    4    5    6    7
   1437                     //        x000 x000 x000 x000 x000 x000 x000 x000
   1438 
   1439                     // denormalize if needed
   1440                     if (conversionType != CONVERT_NONE)
   1441                     {
   1442                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   1443                     }
   1444                     currentVertexElement++;
   1445                 }
   1446                 else
   1447                 {
   1448                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1449                 }
   1450 
   1451                 if (currentVertexElement > 3)
   1452                 {
   1453                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1454                     // reset to the next vVertexElement to output
   1455                     currentVertexElement = 0;
   1456                 }
   1457             }
   1458         }
   1459     }
   1460     else
   1461     {
    1462         SWR_ASSERT(0, "Unsupported extend type");
   1463     }
   1464 }
   1465 
   1466 //////////////////////////////////////////////////////////////////////////
   1467 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
   1468 /// denormalizes if needed, converts to F32 if needed, and positions in
    1469 /// the proper SIMD rows to be output to the simdvertex structure
   1470 /// @param args: (tuple of args, listed below)
   1471 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
   1472 ///   @param pVtxOut - base pointer to output simdvertex struct
   1473 ///   @param extendType - sign extend or zero extend
    1474 ///   @param conversionType - conversion to apply (none, normalized, scaled, or sfixed)
    1475 ///   @param currentVertexElement - reference to the current vVertexElement
    1476 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
   1477 ///   @param compMask - component packing mask
   1478 ///   @param compCtrl - component control val
   1479 ///   @param vVertexElements[4] - vertex components to output
   1480 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
   1481 {
   1482     // Unpack tuple args
   1483     Value* (&vGatherResult)[2] = std::get<0>(args);
   1484     Value* pVtxOut = std::get<1>(args);
   1485     const Instruction::CastOps extendType = std::get<2>(args);
   1486     const ConversionType conversionType = std::get<3>(args);
   1487     uint32_t &currentVertexElement = std::get<4>(args);
   1488     uint32_t &outputElt = std::get<5>(args);
   1489     const ComponentEnable compMask = std::get<6>(args);
   1490     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
   1491     Value* (&vVertexElements)[4] = std::get<8>(args);
   1492 
   1493     // cast types
   1494     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
   1495     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
   1496 
   1497     // have to do extra work for sign extending
   1498     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
   1499         (extendType == Instruction::CastOps::FPExt))
   1500     {
    1501         // is this a half-precision (16-bit) float that needs FPExt to full precision?
    1502         bool bFP = (extendType == Instruction::CastOps::FPExt);
   1503 
   1504         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
   1505         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
   1506 
   1507         // shuffle mask
   1508         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
   1509                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
   1510         Value* vi128XY = nullptr;
   1511         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
   1512             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
   1513             // after pshufb: group components together in each 128bit lane
   1514             // 256i - 0    1    2    3    4    5    6    7
   1515             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
   1516 
   1517             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
   1518             // after PERMD: move and pack xy components into each 128bit lane
   1519             // 256i - 0    1    2    3    4    5    6    7
   1520             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
   1521         }
   1522 
   1523         // do the same for zw components
   1524         Value* vi128ZW = nullptr;
   1525         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
   1526             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
   1527             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
   1528         }
   1529 
   1530         // init denormalize variables if needed
   1531         Instruction::CastOps IntToFpCast;
   1532         Value* conversionFactor;
   1533 
   1534         switch (conversionType)
   1535         {
   1536         case CONVERT_NORMALIZED:
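                     // SNORM16: scale the sign-extended word range [-32767, 32767] back to [-1.0, 1.0]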
   1537             IntToFpCast = Instruction::CastOps::SIToFP;
   1538             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
   1539             break;
   1540         case CONVERT_SSCALED:
   1541             IntToFpCast = Instruction::CastOps::SIToFP;
   1542             conversionFactor = VIMMED1((float)(1.0));
   1543             break;
   1544         case CONVERT_USCALED:
   1545             SWR_ASSERT(0, "Type should not be sign extended!");
   1546             conversionFactor = nullptr;
   1547             break;
   1548         default:
   1549             SWR_ASSERT(conversionType == CONVERT_NONE);
   1550             conversionFactor = nullptr;
   1551             break;
   1552         }
   1553 
    1554         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
   1555         for (uint32_t i = 0; i < 4; i++)
   1556         {
   1557             if (isComponentEnabled(compMask, i))
   1558             {
   1559                 if (compCtrl[i] == ComponentControl::StoreSrc)
   1560                 {
   1561                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
   1562                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
   1563                     // if x or y, use vi128XY permute result, else use vi128ZW
   1564                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
   1565 
   1566                     if (bFP) {
    1567                         // extract 128 bit lanes and convert the packed half floats to full 32-bit floats
   1568                         vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
   1569                     }
   1570                     else {
   1571                         // extract 128 bit lanes to sign extend each component
   1572                         vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
   1573 
   1574                         // denormalize if needed
   1575                         if (conversionType != CONVERT_NONE) {
   1576                             vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   1577                         }
   1578                     }
   1579                     currentVertexElement++;
   1580                 }
   1581                 else
   1582                 {
   1583                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1584                 }
   1585 
   1586                 if (currentVertexElement > 3)
   1587                 {
   1588                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1589                     // reset to the next vVertexElement to output
   1590                     currentVertexElement = 0;
   1591                 }
   1592             }
   1593         }
   1594     }
   1595     // else zero extend
   1596     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
   1597     {
   1598         // pshufb masks for each component
   1599         Value* vConstMask[2];
   1600         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
   1601             // x/z shuffle mask
   1602             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
   1603                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
   1604         }
   1605 
   1606         if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
   1607             // y/w shuffle mask
   1608             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
   1609                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
   1610         }
   1611 
   1612         // init denormalize variables if needed
   1613         Instruction::CastOps fpCast;
   1614         Value* conversionFactor;
   1615 
   1616         switch (conversionType)
   1617         {
   1618         case CONVERT_NORMALIZED:
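                     // UNORM16: scale the unsigned word range [0, 65535] to [0.0, 1.0]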
   1619             fpCast = Instruction::CastOps::UIToFP;
   1620             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
   1621             break;
   1622         case CONVERT_USCALED:
   1623             fpCast = Instruction::CastOps::UIToFP;
   1624             conversionFactor = VIMMED1((float)(1.0f));
   1625             break;
   1626         case CONVERT_SSCALED:
   1627             SWR_ASSERT(0, "Type should not be zero extended!");
   1628             conversionFactor = nullptr;
   1629             break;
   1630         default:
   1631             SWR_ASSERT(conversionType == CONVERT_NONE);
   1632             conversionFactor = nullptr;
   1633             break;
   1634         }
   1635 
   1636         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
   1637         for (uint32_t i = 0; i < 4; i++)
   1638         {
   1639             if (isComponentEnabled(compMask, i))
   1640             {
   1641                 if (compCtrl[i] == ComponentControl::StoreSrc)
   1642                 {
   1643                     // select correct constMask for x/z or y/w pshufb
   1644                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
   1645                     // if x or y, use vi128XY permute result, else use vi128ZW
   1646                     uint32_t selectedGather = (i < 2) ? 0 : 1;
   1647 
   1648                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
   1649                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
   1650                     // 256i - 0    1    2    3    4    5    6    7
   1651                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
   1652 
   1653                     // denormalize if needed
   1654                     if (conversionType != CONVERT_NONE)
   1655                     {
   1656                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
   1657                     }
   1658                     currentVertexElement++;
   1659                 }
   1660                 else
   1661                 {
   1662                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
   1663                 }
   1664 
   1665                 if (currentVertexElement > 3)
   1666                 {
   1667                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
   1668                     // reset to the next vVertexElement to output
   1669                     currentVertexElement = 0;
   1670                 }
   1671             }
   1672         }
   1673     }
   1674     else
   1675     {
    1676         SWR_ASSERT(0, "Unsupported extend type");
   1677     }
   1678 }
   1679 
   1680 //////////////////////////////////////////////////////////////////////////
   1681 /// @brief Output a simdvertex worth of elements to the current outputElt
   1682 /// @param pVtxOut - base address of VIN output struct
   1683 /// @param outputElt - simdvertex offset in VIN to write to
   1684 /// @param numEltsToStore - number of simdvertex rows to write out
   1685 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
   1686 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
   1687 {
   1688     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
   1689 
   1690     for(uint32_t c = 0; c < numEltsToStore; ++c)
   1691     {
   1692         // STORE expects FP32 x vWidth type, just bitcast if needed
   1693         if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
   1694 #if FETCH_DUMP_VERTEX
   1695             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
   1696 #endif
   1697             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
   1698         }
   1699 #if FETCH_DUMP_VERTEX
   1700         else
   1701         {
   1702             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
   1703         }
   1704 #endif
   1705         // outputElt * 4 = offsetting by the size of a simdvertex
   1706         // + c offsets to a 32bit x vWidth row within the current vertex
   1707         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
   1708         STORE(vVertexElements[c], dest);
   1709     }
   1710 }
   1711 
   1712 //////////////////////////////////////////////////////////////////////////
   1713 /// @brief Generates a constant vector of values based on the
   1714 /// ComponentControl value
   1715 /// @param ctrl - ComponentControl value
   1716 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
   1717 {
   1718     switch(ctrl)
   1719     {
   1720         case NoStore:   return VUNDEF_I();
   1721         case Store0:    return VIMMED1(0);
   1722         case Store1Fp:  return VIMMED1(1.0f);
   1723         case Store1Int: return VIMMED1(1);
   1724         case StoreVertexId:
   1725         {
   1726             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
   1727             return VBROADCAST(pId);
   1728         }
   1729         case StoreInstanceId:
   1730         {
   1731             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
   1732             return VBROADCAST(pId);
   1733         }
   1734         case StoreSrc:
   1735         default:        SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
   1736     }
   1737 }
   1738 
   1739 //////////////////////////////////////////////////////////////////////////
   1740 /// @brief Returns the enable mask for the specified component.
   1741 /// @param enableMask - enable bits
   1742 /// @param component - component to check if enabled.
   1743 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
   1744 {
   1745     switch (component)
   1746     {
   1747         // X
   1748     case 0: return (enableMask & ComponentEnable::X);
   1749         // Y
   1750     case 1: return (enableMask & ComponentEnable::Y);
   1751         // Z
   1752     case 2: return (enableMask & ComponentEnable::Z);
   1753         // W
   1754     case 3: return (enableMask & ComponentEnable::W);
   1755 
   1756     default: return false;
   1757     }
   1758 }
   1759 
   1760 
   1761 //////////////////////////////////////////////////////////////////////////
   1762 /// @brief JITs from fetch shader IR
   1763 /// @param hJitMgr - JitManager handle
    1764 /// @param hFunc  - handle to the LLVM function IR
   1765 /// @return PFN_FETCH_FUNC - pointer to fetch code
   1766 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
   1767 {
   1768     const llvm::Function* func = (const llvm::Function*)hFunc;
   1769     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
   1770     PFN_FETCH_FUNC pfnFetch;
   1771 
   1772     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    1773     // MCJIT finalizes modules the first time you JIT code from them. Once a module is finalized, you cannot add new IR to it
   1774     pJitMgr->mIsModuleFinalized = true;
   1775 
   1776 #if defined(KNOB_SWRC_TRACING)
   1777     char fName[1024];
   1778     const char *funcName = func->getName().data();
   1779     sprintf(fName, "%s.bin", funcName);
    1780     FILE *fd = fopen(fName, "wb");
             if (fd)
             {
    1781         fwrite((void *)pfnFetch, 1, 2048, fd);
    1782         fclose(fd);
             }
   1783 #endif
   1784 
   1785     pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
   1786 
   1787     return pfnFetch;
   1788 }
   1789 
   1790 //////////////////////////////////////////////////////////////////////////
   1791 /// @brief JIT compiles fetch shader
   1792 /// @param hJitMgr - JitManager handle
   1793 /// @param state   - fetch state to build function from
   1794 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
   1795 {
   1796     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
   1797 
   1798     pJitMgr->SetupNewModule();
   1799 
   1800     FetchJit theJit(pJitMgr);
   1801     HANDLE hFunc = theJit.Create(state);
   1802 
   1803     return JitFetchFunc(hJitMgr, hFunc);
   1804 }
    1805 }
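
         // Typical usage (sketch only; the FETCH_COMPILE_STATE fields and the PFN_FETCH_FUNC call
         // signature are defined in fetch_jit.h / jit_api.h and are not repeated here):
         //   FETCH_COMPILE_STATE state = {};
         //   // ... populate the vertex layout and index format from the current pipeline state ...
         //   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
         //   // the driver caches pfnFetch and calls it during draw processing to assemble simdvertices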