Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file pa.h
     24 *
     25 * @brief Definitions for primitive assembly.
     26 *        N primitives are assembled at a time, where N is the SIMD width.
     27 *        A state machine, that is specific for a given topology, drives the
     28 *        assembly of vertices into triangles.
     29 *
     30 ******************************************************************************/
     31 #pragma once
     32 
     33 #include "frontend.h"
     34 
     35 struct PA_STATE
     36 {
     37 #if USE_SIMD16_FRONTEND
     38     enum
     39     {
     40         SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
     41         SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
     42         SIMD_WIDTH_LOG2 = 4
     43     };
     44 
     45     typedef         simd16mask          SIMDMASK;
     46 
     47     typedef         simd16scalar        SIMDSCALAR;
     48     typedef         simd16vector        SIMDVECTOR;
     49     typedef         simd16vertex        SIMDVERTEX;
     50 
     51     typedef         simd16scalari       SIMDSCALARI;
     52 
     53 #else
     54     enum
     55     {
     56         SIMD_WIDTH      = KNOB_SIMD_WIDTH,
     57         SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
     58         SIMD_WIDTH_LOG2 = 3
     59     };
     60 
     61     typedef         simdmask            SIMDMASK;
     62 
     63     typedef         simdscalar          SIMDSCALAR;
     64     typedef         simdvector          SIMDVECTOR;
     65     typedef         simdvertex          SIMDVERTEX;
     66 
     67     typedef         simdscalari         SIMDSCALARI;
     68 
     69 #endif
     70     DRAW_CONTEXT *pDC{ nullptr };       // draw context
     71     uint8_t* pStreamBase{ nullptr };    // vertex stream
     72     uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
     73     uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units
     74 
     75     // The topology the binner will use. In some cases the FE changes the topology from the api state.
     76     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
     77 
     78 #if ENABLE_AVX512_SIMD16
     79     bool useAlternateOffset{ false };
     80 #endif
     81 
     82     bool viewportArrayActive{ false };
     83     bool rtArrayActive { false };
     84     uint32_t numVertsPerPrim{ 0 };
     85 
     86     PA_STATE(){}
     87     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
     88         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}
     89 
     90     virtual bool HasWork() = 0;
     91     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
     92 #if ENABLE_AVX512_SIMD16
     93     virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
     94 #endif
     95     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
     96 #if ENABLE_AVX512_SIMD16
     97     virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
     98 #endif
     99     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    100     virtual bool NextPrim() = 0;
    101     virtual SIMDVERTEX& GetNextVsOutput() = 0;
    102     virtual bool GetNextStreamOutput() = 0;
    103     virtual SIMDMASK& GetNextVsIndices() = 0;
    104     virtual uint32_t NumPrims() = 0;
    105     virtual void Reset() = 0;
    106     virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
    107 };
    108 
    109 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
    110 // output. Here is the sequence
    111 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
    112 //    2. Execute PA function to assemble and bin triangles.
    113 //        a.    The PA function is a set of functions that collectively make up the
    114 //            state machine for a given topology.
    115 //                1.    We use a state index to track which PA function to call.
    116 //        b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
    117 //                1.    We call this the current and previous simd vertex.
    118 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
    119 //                    order to assemble the second triangle, for a triangle list, we'll need the
    120 //                    last vertex from the previous simd and the first 2 vertices from the current simd.
    121 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
    122 //
    123 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
    124 // cuts
    125 struct PA_STATE_OPT : public PA_STATE
    126 {
    127     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
    128     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
    129 
    130     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
    131 
    132     uint32_t cur{ 0 };                   // index to current VS output.
    133     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
    134     const uint32_t first{ 0 };           // index to first VS output. Used for tri fan and line loop.
    135 
    136     uint32_t counter{ 0 };               // state counter
    137     bool reset{ false };                 // reset state
    138 
    139     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
    140     SIMDSCALARI primID;
    141 
    142     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    143 #if ENABLE_AVX512_SIMD16
    144     typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    145 #endif
    146     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    147 
    148     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
    149 #if ENABLE_AVX512_SIMD16
    150     PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
    151 #endif
    152     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    153     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
    154 #if ENABLE_AVX512_SIMD16
    155     PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
    156 #endif
    157 
    158     // state used to advance the PA when Next is called
    159     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
    160 #if ENABLE_AVX512_SIMD16
    161     PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
    162 #endif
    163     uint32_t           nextNumSimdPrims{ 0 };
    164     uint32_t           nextNumPrimsIncrement{ 0 };
    165     bool               nextReset{ false };
    166     bool               isStreaming{ false };
    167 
    168     SIMDMASK           junkIndices  { 0 };          // temporary index store for unused virtual function
    169 
    170     PA_STATE_OPT() {}
    171     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
    172         uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
    173 
    174     bool HasWork()
    175     {
    176         return (this->numPrimsComplete < this->numPrims) ? true : false;
    177     }
    178 
    179     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    180     {
    181         SWR_ASSERT(slot < vertexStride);
    182         uint32_t offset = index * vertexStride + slot;
    183         simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
    184         return vertexSlot;
    185     }
    186 
    187 #if ENABLE_AVX512_SIMD16
    188     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    189     {
    190         SWR_ASSERT(slot < vertexStride);
    191         uint32_t offset = index * vertexStride + slot;
    192         simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
    193         return vertexSlot;
    194     }
    195 
    196 #endif
    197     // Assembles 4 triangles. Each simdvector is a single vertex from 4
    198     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    199     bool Assemble(uint32_t slot, simdvector verts[])
    200     {
    201         return this->pfnPaFunc(*this, slot, verts);
    202     }
    203 
    204 #if ENABLE_AVX512_SIMD16
    205     bool Assemble(uint32_t slot, simd16vector verts[])
    206     {
    207         return this->pfnPaFunc_simd16(*this, slot, verts);
    208     }
    209 
    210 #endif
    211     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    212     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    213     {
    214         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    215     }
    216 
    217     bool NextPrim()
    218     {
    219         this->pfnPaFunc = this->pfnPaNextFunc;
    220 #if ENABLE_AVX512_SIMD16
    221         this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
    222 #endif
    223         this->numSimdPrims = this->nextNumSimdPrims;
    224         this->numPrimsComplete += this->nextNumPrimsIncrement;
    225         this->reset = this->nextReset;
    226 
    227         if (this->isStreaming)
    228         {
    229             this->reset = false;
    230         }
    231 
    232         bool morePrims = false;
    233 
    234         if (this->numSimdPrims > 0)
    235         {
    236             morePrims = true;
    237             this->numSimdPrims--;
    238         }
    239         else
    240         {
    241             this->counter = (this->reset) ? 0 : (this->counter + 1);
    242             this->reset = false;
    243         }
    244 
    245         if (!HasWork())
    246         {
    247             morePrims = false;    // no more to do
    248         }
    249 
    250         return morePrims;
    251     }
    252 
    253     SIMDVERTEX& GetNextVsOutput()
    254     {
    255         const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
    256 
    257         // increment cur and prev indices
    258         if (counter < numSimdVerts)
    259         {
    260             // prev undefined for first state
    261             prev = cur;
    262             cur = counter;
    263         }
    264         else
    265         {
    266             // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
    267             uint32_t temp = prev;
    268 
    269             prev = cur;
    270             cur = temp;
    271         }
    272 
    273         SWR_ASSERT(cur < numSimdVerts);
    274         SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
    275 
    276         return *(SIMDVERTEX*)pVertex;
    277     }
    278 
    279     SIMDMASK& GetNextVsIndices()
    280     {
    281         // unused in optimized PA, pass tmp buffer back
    282         return junkIndices;
    283     }
    284 
    285     bool GetNextStreamOutput()
    286     {
    287         this->prev = this->cur;
    288         this->cur = this->counter;
    289 
    290         return HasWork();
    291     }
    292 
    293     uint32_t NumPrims()
    294     {
    295         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
    296             (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    297     }
    298 
    299     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    300         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    301         uint32_t numSimdPrims = 0,
    302         uint32_t numPrimsIncrement = 0,
    303         bool reset = false)
    304     {
    305         this->pfnPaNextFunc = pfnPaNextFunc;
    306         this->nextNumSimdPrims = numSimdPrims;
    307         this->nextNumPrimsIncrement = numPrimsIncrement;
    308         this->nextReset = reset;
    309 
    310         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    311     }
    312 
    313 #if ENABLE_AVX512_SIMD16
    314     void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    315         PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    316         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    317         uint32_t numSimdPrims = 0,
    318         uint32_t numPrimsIncrement = 0,
    319         bool reset = false)
    320     {
    321         this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
    322         this->pfnPaNextFunc = pfnPaNextFunc;
    323         this->nextNumSimdPrims = numSimdPrims;
    324         this->nextNumPrimsIncrement = numPrimsIncrement;
    325         this->nextReset = reset;
    326 
    327         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    328     }
    329 
    330 #endif
    331     void Reset()
    332     {
    333 #if ENABLE_AVX512_SIMD16
    334         useAlternateOffset = false;
    335 
    336 #endif
    337         this->pfnPaFunc = this->pfnPaFuncReset;
    338 #if ENABLE_AVX512_SIMD16
    339         this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
    340 #endif
    341         this->numPrimsComplete = 0;
    342         this->numSimdPrims = 0;
    343         this->cur = 0;
    344         this->prev = 0;
    345         this->counter = 0;
    346         this->reset = false;
    347     }
    348 
    349     SIMDSCALARI GetPrimID(uint32_t startID)
    350     {
    351 #if USE_SIMD16_FRONTEND
    352         return _simd16_add_epi32(this->primID,
    353             _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
    354 #else
    355         return _simd_add_epi32(this->primID,
    356             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
    357 #endif
    358     }
    359 };
    360 
    361 // helper C wrappers to avoid having to rewrite all the PA topology state functions
    362 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    363     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    364     uint32_t numSimdPrims = 0,
    365     uint32_t numPrimsIncrement = 0,
    366     bool reset = false)
    367 {
    368     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
    369 }
    370 
    371 #if ENABLE_AVX512_SIMD16
    372 INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    373     PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    374     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    375     uint32_t numSimdPrims = 0,
    376     uint32_t numPrimsIncrement = 0,
    377     bool reset = false)
    378 {
    379     return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
    380 }
    381 
    382 #endif
    383 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
    384 {
    385     return pa.GetSimdVector(index, slot);
    386 }
    387 
    388 #if ENABLE_AVX512_SIMD16
    389 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
    390 {
    391     return pa.GetSimdVector_simd16(index, slot);
    392 }
    393 
    394 #endif
    395 // Cut-aware primitive assembler.
    396 struct PA_STATE_CUT : public PA_STATE
    397 {
    398     SIMDMASK* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
    399     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
    400     uint32_t numAttribs{ 0 };            // number of attributes
    401     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
    402     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
    403 #if ENABLE_AVX512_SIMD16
    404     OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
    405 #else
    406     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
    407 #endif
    408     SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
    409     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
    410     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
    411     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
    412     uint32_t curVertex{ 0 };             // current unprocessed vertex
    413     uint32_t startPrimId{ 0 };           // starting prim id
    414     SIMDSCALARI vPrimId;                 // vector of prim ID
    415     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
    416     uint32_t vertsPerPrim{ 0 };
    417     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
    418                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
    419                                          // while the GS sends valid verts for every index
    420 
    421     simdvector      junkVector;          // junk simdvector for unimplemented API
    422 #if ENABLE_AVX512_SIMD16
    423     simd16vector    junkVector_simd16;   // junk simd16vector for unimplemented API
    424 #endif
    425 
    426     // Topology state tracking
    427     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    428     uint32_t curIndex{ 0 };
    429     bool reverseWinding{ false };        // indicates reverse winding for strips
    430     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
    431 
    432     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    433     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
    434 
    435     PA_STATE_CUT() {}
    436     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
    437         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
    438         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
    439     {
    440         numVerts = in_streamSizeInVerts;
    441         numAttribs = in_numAttribs;
    442         binTopology = topo;
    443         needOffsets = false;
    444         processCutVerts = in_processCutVerts;
    445 
    446         numVertsToAssemble = numRemainingVerts = in_numVerts;
    447         numPrimsAssembled = 0;
    448         headVertex = tailVertex = curVertex = 0;
    449 
    450         curIndex = 0;
    451         pCutIndices = in_pIndices;
    452         memset(indices, 0, sizeof(indices));
    453 #if USE_SIMD16_FRONTEND
    454         vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    455 #else
    456         vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    457 #endif
    458         reverseWinding = false;
    459         adjExtraVert = -1;
    460 
    461         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
    462         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
    463 
    464         switch (topo)
    465         {
    466         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
    467         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
    468         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
    469         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
    470                                     {
    471                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
    472                                     }
    473                                     else
    474                                     {
    475                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
    476                                     }
    477                                     break;
    478 
    479         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
    480         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
    481         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
    482         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
    483         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
    484         default: assert(0 && "Unimplemented topology");
    485         }
    486     }
    487 
    488     SIMDVERTEX& GetNextVsOutput()
    489     {
    490         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
    491         this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
    492         this->needOffsets = true;
    493         SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
    494 
    495         return *(SIMDVERTEX*)pVertex;
    496     }
    497 
    498     SIMDMASK& GetNextVsIndices()
    499     {
    500         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
    501         SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
    502         return *pCurCutIndex;
    503     }
    504 
    505     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    506     {
    507         // unused
    508         SWR_ASSERT(0 && "Not implemented");
    509         return junkVector;
    510     }
    511 
    512 #if ENABLE_AVX512_SIMD16
    513     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    514     {
    515         // unused
    516         SWR_ASSERT(0 && "Not implemented");
    517         return junkVector_simd16;
    518     }
    519 
    520 #endif
    521     bool GetNextStreamOutput()
    522     {
    523         this->headVertex += SIMD_WIDTH;
    524         this->needOffsets = true;
    525         return HasWork();
    526     }
    527 
    528     SIMDSCALARI GetPrimID(uint32_t startID)
    529     {
    530 #if USE_SIMD16_FRONTEND
    531         return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
    532 #else
    533         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
    534 #endif
    535     }
    536 
    537     void Reset()
    538     {
    539 #if ENABLE_AVX512_SIMD16
    540         useAlternateOffset = false;
    541 
    542 #endif
    543         this->numRemainingVerts = this->numVertsToAssemble;
    544         this->numPrimsAssembled = 0;
    545         this->curIndex = 0;
    546         this->curVertex = 0;
    547         this->tailVertex = 0;
    548         this->headVertex = 0;
    549         this->reverseWinding = false;
    550         this->adjExtraVert = -1;
    551 #if USE_SIMD16_FRONTEND
    552         this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    553 #else
    554         this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    555 #endif
    556     }
    557 
    558     bool HasWork()
    559     {
    560         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    561     }
    562 
    563     bool IsVertexStoreFull()
    564     {
    565         return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    566     }
    567 
    568     void RestartTopology()
    569     {
    570         this->curIndex = 0;
    571         this->reverseWinding = false;
    572         this->adjExtraVert = -1;
    573     }
    574 
    575     bool IsCutIndex(uint32_t vertex)
    576     {
    577         uint32_t vertexIndex = vertex / SIMD_WIDTH;
    578         uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
    579         return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
    580     }
    581 
    582     // iterates across the unprocessed verts until we hit the end or we
    583     // have assembled SIMD prims
    584     void ProcessVerts()
    585     {
    586         while (this->numPrimsAssembled != SIMD_WIDTH &&
    587             this->numRemainingVerts > 0 &&
    588             this->curVertex != this->headVertex)
    589         {
    590             // if cut index, restart topology
    591             if (IsCutIndex(this->curVertex))
    592             {
    593                 if (this->processCutVerts)
    594                 {
    595                     (this->*pfnPa)(this->curVertex, false);
    596                 }
    597                 // finish off tri strip w/ adj before restarting topo
    598                 if (this->adjExtraVert != -1)
    599                 {
    600                     (this->*pfnPa)(this->curVertex, true);
    601                 }
    602                 RestartTopology();
    603             }
    604             else
    605             {
    606                 (this->*pfnPa)(this->curVertex, false);
    607             }
    608 
    609             this->curVertex++;
    610             if (this->curVertex >= this->numVerts) {
    611                this->curVertex = 0;
    612             }
    613             this->numRemainingVerts--;
    614         }
    615 
    616         // special case last primitive for tri strip w/ adj
    617         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
    618         {
    619             (this->*pfnPa)(this->curVertex, true);
    620         }
    621     }
    622 
    623     void Advance()
    624     {
    625         // done with current batch
    626         // advance tail to the current unsubmitted vertex
    627         this->tailVertex = this->curVertex;
    628         this->numPrimsAssembled = 0;
    629 #if USE_SIMD16_FRONTEND
    630         this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
    631 #else
    632         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
    633 #endif
    634     }
    635 
    636     bool NextPrim()
    637     {
    638         // if we've assembled enough prims, we can advance to the next set of verts
    639         if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
    640         {
    641             Advance();
    642         }
    643         return false;
    644     }
    645 
    646     void ComputeOffsets()
    647     {
    648         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    649         {
    650             uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
    651             SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
    652 
    653             // step to simdvertex batch
    654             const uint32_t simdShift = SIMD_WIDTH_LOG2;
    655 #if USE_SIMD16_FRONTEND
    656             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
    657             this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
    658 #else
    659             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
    660             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
    661 #endif
    662 
    663             // step to index
    664             const uint32_t simdMask = SIMD_WIDTH - 1;
    665 #if USE_SIMD16_FRONTEND
    666             SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
    667             this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
    668 #else
    669             SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
    670             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
    671 #endif
    672         }
    673     }
    674 
    675     bool Assemble(uint32_t slot, simdvector *verts)
    676     {
    677         // process any outstanding verts
    678         ProcessVerts();
    679 
    680         // return false if we don't have enough prims assembled
    681         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    682         {
    683             return false;
    684         }
    685 
    686         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    687         if (this->needOffsets)
    688         {
    689             ComputeOffsets();
    690             this->needOffsets = false;
    691         }
    692 
    693         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    694         {
    695             SIMDSCALARI offsets = this->vOffsets[v];
    696 
    697             // step to attribute
    698 #if USE_SIMD16_FRONTEND
    699             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
    700 #else
    701             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
    702 #endif
    703 
    704             float* pBase = (float*)this->pStreamBase;
    705             for (uint32_t c = 0; c < 4; ++c)
    706             {
    707 #if USE_SIMD16_FRONTEND
    708                 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
    709 
    710                 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
    711                 simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
    712                 verts[v].v[c] = t;
    713 #else
    714                 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
    715 #endif
    716 
    717                 // move base to next component
    718                 pBase += SIMD_WIDTH;
    719             }
    720         }
    721 
    722         return true;
    723     }
    724 
    725 #if ENABLE_AVX512_SIMD16
    726     bool Assemble(uint32_t slot, simd16vector verts[])
    727     {
    728         // process any outstanding verts
    729         ProcessVerts();
    730 
    731         // return false if we don't have enough prims assembled
    732         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    733         {
    734             return false;
    735         }
    736 
    737         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    738         if (this->needOffsets)
    739         {
    740             ComputeOffsets();
    741             this->needOffsets = false;
    742         }
    743 
    744         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    745         {
    746             SIMDSCALARI offsets = this->vOffsets[v];
    747 
    748             // step to attribute
    749 #if USE_SIMD16_FRONTEND
    750             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
    751 #else
    752             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
    753 #endif
    754 
    755             float* pBase = (float*)this->pStreamBase;
    756             for (uint32_t c = 0; c < 4; ++c)
    757             {
    758 #if USE_SIMD16_FRONTEND
    759                 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
    760 #else
    761                 verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
    762 #endif
    763 
    764                 // move base to next component
    765                 pBase += SIMD_WIDTH;
    766             }
    767         }
    768 
    769         return true;
    770     }
    771 
    772 #endif
    773     void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    774     {
    775         // move to slot
    776         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    777         {
    778             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
    779 #if USE_SIMD16_FRONTEND
    780             uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
    781 #else
    782             uint32_t offset = pOffset[triIndex];
    783 #endif
    784             offset += sizeof(SIMDVECTOR) * slot;
    785             float* pVert = (float*)&tri[v];
    786             for (uint32_t c = 0; c < 4; ++c)
    787             {
    788                 float* pComponent = (float*)(this->pStreamBase + offset);
    789                 pVert[c] = *pComponent;
    790                 offset += SIMD_WIDTH * sizeof(float);
    791             }
    792         }
    793     }
    794 
    795     uint32_t NumPrims()
    796     {
    797         return this->numPrimsAssembled;
    798     }
    799 
    800     // Per-topology functions
    801     void ProcessVertTriStrip(uint32_t index, bool finish)
    802     {
    803         this->vert[this->curIndex] = index;
    804         this->curIndex++;
    805         if (this->curIndex == 3)
    806         {
    807             // assembled enough verts for prim, add to gather indices
    808             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    809             if (reverseWinding)
    810             {
    811                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
    812                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
    813             }
    814             else
    815             {
    816                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
    817                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
    818             }
    819 
    820             // increment numPrimsAssembled
    821             this->numPrimsAssembled++;
    822 
    823             // set up next prim state
    824             this->vert[0] = this->vert[1];
    825             this->vert[1] = this->vert[2];
    826             this->curIndex = 2;
    827             this->reverseWinding ^= 1;
    828         }
    829     }
    830 
    831     template<bool gsEnabled>
    832     void AssembleTriStripAdj()
    833     {
    834         if (!gsEnabled)
    835         {
    836             this->vert[1] = this->vert[2];
    837             this->vert[2] = this->vert[4];
    838 
    839             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    840             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    841             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    842 
    843             this->vert[4] = this->vert[2];
    844             this->vert[2] = this->vert[1];
    845         }
    846         else
    847         {
    848             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    849             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    850             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    851             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    852             this->indices[4][this->numPrimsAssembled] = this->vert[4];
    853             this->indices[5][this->numPrimsAssembled] = this->vert[5];
    854         }
    855         this->numPrimsAssembled++;
    856     }
    857 
    858 
    859     template<bool gsEnabled>
    860     void ProcessVertTriStripAdj(uint32_t index, bool finish)
    861     {
    862         // handle last primitive of tristrip
    863         if (finish && this->adjExtraVert != -1)
    864         {
    865             this->vert[3] = this->adjExtraVert;
    866             AssembleTriStripAdj<gsEnabled>();
    867             this->adjExtraVert = -1;
    868             return;
    869         }
    870 
    871         switch (this->curIndex)
    872         {
    873         case 0:
    874         case 1:
    875         case 2:
    876         case 4:
    877             this->vert[this->curIndex] = index;
    878             this->curIndex++;
    879             break;
    880         case 3:
    881             this->vert[5] = index;
    882             this->curIndex++;
    883             break;
    884         case 5:
    885             if (this->adjExtraVert == -1)
    886             {
    887                 this->adjExtraVert = index;
    888             }
    889             else
    890             {
    891                 this->vert[3] = index;
    892                 if (!gsEnabled)
    893                 {
    894                     AssembleTriStripAdj<gsEnabled>();
    895 
    896                     uint32_t nextTri[6];
    897                     if (this->reverseWinding)
    898                     {
    899                         nextTri[0] = this->vert[4];
    900                         nextTri[1] = this->vert[0];
    901                         nextTri[2] = this->vert[2];
    902                         nextTri[4] = this->vert[3];
    903                         nextTri[5] = this->adjExtraVert;
    904                     }
    905                     else
    906                     {
    907                         nextTri[0] = this->vert[2];
    908                         nextTri[1] = this->adjExtraVert;
    909                         nextTri[2] = this->vert[3];
    910                         nextTri[4] = this->vert[4];
    911                         nextTri[5] = this->vert[0];
    912                     }
    913                     for (uint32_t i = 0; i < 6; ++i)
    914                     {
    915                         this->vert[i] = nextTri[i];
    916                     }
    917 
    918                     this->adjExtraVert = -1;
    919                     this->reverseWinding ^= 1;
    920                 }
    921                 else
    922                 {
    923                     this->curIndex++;
    924                 }
    925             }
    926             break;
    927         case 6:
    928             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
    929             AssembleTriStripAdj<gsEnabled>();
    930 
    931             uint32_t nextTri[6];
    932             if (this->reverseWinding)
    933             {
    934                 nextTri[0] = this->vert[4];
    935                 nextTri[1] = this->vert[0];
    936                 nextTri[2] = this->vert[2];
    937                 nextTri[4] = this->vert[3];
    938                 nextTri[5] = this->adjExtraVert;
    939             }
    940             else
    941             {
    942                 nextTri[0] = this->vert[2];
    943                 nextTri[1] = this->adjExtraVert;
    944                 nextTri[2] = this->vert[3];
    945                 nextTri[4] = this->vert[4];
    946                 nextTri[5] = this->vert[0];
    947             }
    948             for (uint32_t i = 0; i < 6; ++i)
    949             {
    950                 this->vert[i] = nextTri[i];
    951             }
    952             this->reverseWinding ^= 1;
    953             this->adjExtraVert = index;
    954             this->curIndex--;
    955             break;
    956         }
    957     }
    958 
    959     void ProcessVertTriList(uint32_t index, bool finish)
    960     {
    961         this->vert[this->curIndex] = index;
    962         this->curIndex++;
    963         if (this->curIndex == 3)
    964         {
    965             // assembled enough verts for prim, add to gather indices
    966             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    967             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    968             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    969 
    970             // increment numPrimsAssembled
    971             this->numPrimsAssembled++;
    972 
    973             // set up next prim state
    974             this->curIndex = 0;
    975         }
    976     }
    977 
    978     void ProcessVertTriListAdj(uint32_t index, bool finish)
    979     {
    980         this->vert[this->curIndex] = index;
    981         this->curIndex++;
    982         if (this->curIndex == 6)
    983         {
    984             // assembled enough verts for prim, add to gather indices
    985             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    986             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    987             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    988             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    989             this->indices[4][this->numPrimsAssembled] = this->vert[4];
    990             this->indices[5][this->numPrimsAssembled] = this->vert[5];
    991 
    992             // increment numPrimsAssembled
    993             this->numPrimsAssembled++;
    994 
    995             // set up next prim state
    996             this->curIndex = 0;
    997         }
    998     }
    999 
   1000     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
   1001     {
   1002         this->vert[this->curIndex] = index;
   1003         this->curIndex++;
   1004         if (this->curIndex == 6)
   1005         {
   1006             // assembled enough verts for prim, add to gather indices
   1007             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1008             this->indices[1][this->numPrimsAssembled] = this->vert[2];
   1009             this->indices[2][this->numPrimsAssembled] = this->vert[4];
   1010 
   1011             // increment numPrimsAssembled
   1012             this->numPrimsAssembled++;
   1013 
   1014             // set up next prim state
   1015             this->curIndex = 0;
   1016         }
   1017     }
   1018 
   1019 
   1020     void ProcessVertLineList(uint32_t index, bool finish)
   1021     {
   1022         this->vert[this->curIndex] = index;
   1023         this->curIndex++;
   1024         if (this->curIndex == 2)
   1025         {
   1026             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1027             this->indices[1][this->numPrimsAssembled] = this->vert[1];
   1028 
   1029             this->numPrimsAssembled++;
   1030             this->curIndex = 0;
   1031         }
   1032     }
   1033 
   1034     void ProcessVertLineStrip(uint32_t index, bool finish)
   1035     {
   1036         this->vert[this->curIndex] = index;
   1037         this->curIndex++;
   1038         if (this->curIndex == 2)
   1039         {
   1040             // assembled enough verts for prim, add to gather indices
   1041             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1042             this->indices[1][this->numPrimsAssembled] = this->vert[1];
   1043 
   1044             // increment numPrimsAssembled
   1045             this->numPrimsAssembled++;
   1046 
   1047             // set up next prim state
   1048             this->vert[0] = this->vert[1];
   1049             this->curIndex = 1;
   1050         }
   1051     }
   1052 
   1053     void ProcessVertLineStripAdj(uint32_t index, bool finish)
   1054     {
   1055         this->vert[this->curIndex] = index;
   1056         this->curIndex++;
   1057         if (this->curIndex == 4)
   1058         {
   1059             // assembled enough verts for prim, add to gather indices
   1060             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1061             this->indices[1][this->numPrimsAssembled] = this->vert[1];
   1062             this->indices[2][this->numPrimsAssembled] = this->vert[2];
   1063             this->indices[3][this->numPrimsAssembled] = this->vert[3];
   1064 
   1065             // increment numPrimsAssembled
   1066             this->numPrimsAssembled++;
   1067 
   1068             // set up next prim state
   1069             this->vert[0] = this->vert[1];
   1070             this->vert[1] = this->vert[2];
   1071             this->vert[2] = this->vert[3];
   1072             this->curIndex = 3;
   1073         }
   1074     }
   1075 
   1076     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
   1077     {
   1078         this->vert[this->curIndex] = index;
   1079         this->curIndex++;
   1080         if (this->curIndex == 4)
   1081         {
   1082             // assembled enough verts for prim, add to gather indices
   1083             this->indices[0][this->numPrimsAssembled] = this->vert[1];
   1084             this->indices[1][this->numPrimsAssembled] = this->vert[2];
   1085 
   1086             // increment numPrimsAssembled
   1087             this->numPrimsAssembled++;
   1088 
   1089             // set up next prim state
   1090             this->vert[0] = this->vert[1];
   1091             this->vert[1] = this->vert[2];
   1092             this->vert[2] = this->vert[3];
   1093             this->curIndex = 3;
   1094         }
   1095     }
   1096 
   1097     void ProcessVertLineListAdj(uint32_t index, bool finish)
   1098     {
   1099         this->vert[this->curIndex] = index;
   1100         this->curIndex++;
   1101         if (this->curIndex == 4)
   1102         {
   1103             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1104             this->indices[1][this->numPrimsAssembled] = this->vert[1];
   1105             this->indices[2][this->numPrimsAssembled] = this->vert[2];
   1106             this->indices[3][this->numPrimsAssembled] = this->vert[3];
   1107 
   1108             this->numPrimsAssembled++;
   1109             this->curIndex = 0;
   1110         }
   1111     }
   1112 
   1113     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
   1114     {
   1115         this->vert[this->curIndex] = index;
   1116         this->curIndex++;
   1117         if (this->curIndex == 4)
   1118         {
   1119             this->indices[0][this->numPrimsAssembled] = this->vert[1];
   1120             this->indices[1][this->numPrimsAssembled] = this->vert[2];
   1121 
   1122             this->numPrimsAssembled++;
   1123             this->curIndex = 0;
   1124         }
   1125     }
   1126 
   1127     void ProcessVertPointList(uint32_t index, bool finish)
   1128     {
   1129         this->vert[this->curIndex] = index;
   1130         this->curIndex++;
   1131         if (this->curIndex == 1)
   1132         {
   1133             this->indices[0][this->numPrimsAssembled] = this->vert[0];
   1134             this->numPrimsAssembled++;
   1135             this->curIndex = 0;
   1136         }
   1137     }
   1138 };
   1139 
   1140 // Primitive Assembly for data output from the DomainShader.
   1141 struct PA_TESS : PA_STATE
   1142 {
   1143     PA_TESS(
   1144         DRAW_CONTEXT *in_pDC,
   1145         const SIMDSCALAR* in_pVertData,
   1146         uint32_t in_attributeStrideInVectors,
   1147         uint32_t in_vertexStride,
   1148         uint32_t in_numAttributes,
   1149         uint32_t* (&in_ppIndices)[3],
   1150         uint32_t in_numPrims,
   1151         PRIMITIVE_TOPOLOGY in_binTopology,
   1152         uint32_t numVertsPerPrim) :
   1153 
   1154         PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
   1155         m_pVertexData(in_pVertData),
   1156         m_attributeStrideInVectors(in_attributeStrideInVectors),
   1157         m_numAttributes(in_numAttributes),
   1158         m_numPrims(in_numPrims)
   1159     {
   1160 #if USE_SIMD16_FRONTEND
   1161         m_vPrimId = _simd16_setzero_si();
   1162 #else
   1163         m_vPrimId = _simd_setzero_si();
   1164 #endif
   1165         binTopology = in_binTopology;
   1166         m_ppIndices[0] = in_ppIndices[0];
   1167         m_ppIndices[1] = in_ppIndices[1];
   1168         m_ppIndices[2] = in_ppIndices[2];
   1169 
   1170         switch (binTopology)
   1171         {
   1172         case TOP_POINT_LIST:
   1173             m_numVertsPerPrim = 1;
   1174             break;
   1175 
   1176         case TOP_LINE_LIST:
   1177             m_numVertsPerPrim = 2;
   1178             break;
   1179 
   1180         case TOP_TRIANGLE_LIST:
   1181             m_numVertsPerPrim = 3;
   1182             break;
   1183 
   1184         default:
   1185             SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
   1186             break;
   1187         }
   1188     }
   1189 
   1190     bool HasWork()
   1191     {
   1192         return m_numPrims != 0;
   1193     }
   1194 
   1195     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
   1196     {
   1197         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
   1198         return junkVector;
   1199     }
   1200 
   1201 #if ENABLE_AVX512_SIMD16
   1202     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
   1203     {
   1204         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
   1205         return junkVector_simd16;
   1206     }
   1207 
   1208 #endif
   1209     static SIMDSCALARI GenPrimMask(uint32_t numPrims)
   1210     {
   1211         SWR_ASSERT(numPrims <= SIMD_WIDTH);
   1212 #if USE_SIMD16_FRONTEND
   1213         static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
   1214         {
   1215             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
   1216             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
   1217         };
   1218 
   1219         return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
   1220 #else
   1221         static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
   1222         {
   1223             -1, -1, -1, -1, -1, -1, -1, -1,
   1224             0,  0,  0,  0,  0,  0,  0,  0
   1225         };
   1226 
   1227         return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
   1228 #endif
   1229     }
   1230 
   1231     bool Assemble(uint32_t slot, simdvector verts[])
   1232     {
   1233         SWR_ASSERT(slot < m_numAttributes);
   1234 
   1235         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
   1236         if (0 == numPrimsToAssemble)
   1237         {
   1238             return false;
   1239         }
   1240 
   1241         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
   1242 
   1243         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
   1244         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
   1245         {
   1246 #if USE_SIMD16_FRONTEND
   1247             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
   1248 #else
   1249             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
   1250 #endif
   1251 
   1252             const float* pBase = pBaseAttrib;
   1253             for (uint32_t c = 0; c < 4; ++c)
   1254             {
   1255 #if USE_SIMD16_FRONTEND
   1256                 simd16scalar temp = _simd16_mask_i32gather_ps(
   1257                     _simd16_setzero_ps(),
   1258                     pBase,
   1259                     indices,
   1260                     _simd16_castsi_ps(mask),
   1261                     4 /* gcc doesn't like sizeof(float) */);
   1262 
   1263                 verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
   1264 #else
   1265                 verts[i].v[c] = _simd_mask_i32gather_ps(
   1266                     _simd_setzero_ps(),
   1267                     pBase,
   1268                     indices,
   1269                     _simd_castsi_ps(mask),
   1270                     4); // gcc doesn't like sizeof(float)
   1271 #endif
   1272                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
   1273             }
   1274         }
   1275 
   1276         return true;
   1277     }
   1278 
   1279 #if ENABLE_AVX512_SIMD16
   1280     bool Assemble(uint32_t slot, simd16vector verts[])
   1281     {
   1282         SWR_ASSERT(slot < m_numAttributes);
   1283 
   1284         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
   1285         if (0 == numPrimsToAssemble)
   1286         {
   1287             return false;
   1288         }
   1289 
   1290         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
   1291 
   1292         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
   1293         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
   1294         {
   1295 #if USE_SIMD16_FRONTEND
   1296             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
   1297 #else
   1298             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
   1299 #endif
   1300 
   1301             const float* pBase = pBaseAttrib;
   1302             for (uint32_t c = 0; c < 4; ++c)
   1303             {
   1304 #if USE_SIMD16_FRONTEND
   1305                 verts[i].v[c] = _simd16_mask_i32gather_ps(
   1306                     _simd16_setzero_ps(),
   1307                     pBase,
   1308                     indices,
   1309                     _simd16_castsi_ps(mask),
   1310                     4 /* gcc doesn't like sizeof(float) */);
   1311 #else
   1312                 simdscalar temp = _simd_mask_i32gather_ps(
   1313                     _simd_setzero_ps(),
   1314                     pBase,
   1315                     indices,
   1316                     _simd_castsi_ps(mask),
   1317                     4 /* gcc doesn't like sizeof(float) */);
   1318                 verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
   1319 #endif
   1320                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
   1321             }
   1322         }
   1323 
   1324         return true;
   1325     }
   1326 
   1327 #endif
   1328     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1329     {
   1330         SWR_ASSERT(slot < m_numAttributes);
   1331         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
   1332 
   1333         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
   1334         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
   1335         {
   1336 #if USE_SIMD16_FRONTEND
   1337             uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
   1338 #else
   1339             uint32_t index = m_ppIndices[i][primIndex];
   1340 #endif
   1341             const float* pVertData = pVertDataBase;
   1342             float* pVert = (float*)&verts[i];
   1343 
   1344             for (uint32_t c = 0; c < 4; ++c)
   1345             {
   1346                 pVert[c] = pVertData[index];
   1347                 pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
   1348             }
   1349         }
   1350     }
   1351 
   1352     bool NextPrim()
   1353     {
   1354         uint32_t numPrims = PA_TESS::NumPrims();
   1355         m_numPrims -= numPrims;
   1356         m_ppIndices[0] += numPrims;
   1357         m_ppIndices[1] += numPrims;
   1358         m_ppIndices[2] += numPrims;
   1359 
   1360         return HasWork();
   1361     }
   1362 
   1363     SIMDVERTEX& GetNextVsOutput()
   1364     {
   1365         SWR_NOT_IMPL;
   1366         return junkVertex;
   1367     }
   1368 
   1369     bool GetNextStreamOutput()
   1370     {
   1371         SWR_NOT_IMPL;
   1372         return false;
   1373     }
   1374 
   1375     SIMDMASK& GetNextVsIndices()
   1376     {
   1377         SWR_NOT_IMPL;
   1378         return junkIndices;
   1379     }
   1380 
   1381     uint32_t NumPrims()
   1382     {
   1383         return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
   1384     }
   1385 
   1386     void Reset()
   1387     {
   1388         SWR_NOT_IMPL;
   1389     }
   1390 
   1391     SIMDSCALARI GetPrimID(uint32_t startID)
   1392     {
   1393 #if USE_SIMD16_FRONTEND
   1394         return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
   1395 #else
   1396         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
   1397 #endif
   1398     }
   1399 
   1400 private:
   1401     const SIMDSCALAR*   m_pVertexData = nullptr;
   1402     uint32_t            m_attributeStrideInVectors = 0;
   1403     uint32_t            m_numAttributes = 0;
   1404     uint32_t            m_numPrims = 0;
   1405     uint32_t*           m_ppIndices[3];
   1406 
   1407     uint32_t            m_numVertsPerPrim = 0;
   1408 
   1409     SIMDSCALARI         m_vPrimId;
   1410 
   1411     simdvector          junkVector;         // junk simdvector for unimplemented API
   1412 #if ENABLE_AVX512_SIMD16
   1413     simd16vector        junkVector_simd16;  // junk simd16vector for unimplemented API
   1414 #endif
   1415     SIMDVERTEX          junkVertex;         // junk SIMDVERTEX for unimplemented API
   1416     SIMDMASK            junkIndices;        // temporary index store for unused virtual function
   1417 };
   1418 
   1419 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
   1420 // based on state.
   1421 template <typename IsIndexedT, typename IsCutIndexEnabledT>
   1422 struct PA_FACTORY
   1423 {
   1424     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
   1425     {
   1426 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
   1427         const API_STATE& state = GetApiState(pDC);
   1428         if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
   1429             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
   1430             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
   1431             topo == TOP_TRIANGLE_LIST)) ||
   1432 
   1433             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
   1434             // for them in the optimized PA
   1435             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
   1436         {
   1437             memset(&indexStore, 0, sizeof(indexStore));
   1438             uint32_t numAttribs = state.feNumAttributes;
   1439 
   1440             new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
   1441                 vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
   1442             cutPA = true;
   1443         }
   1444         else
   1445 #endif
   1446         {
   1447             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
   1448             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
   1449             cutPA = false;
   1450         }
   1451 
   1452     }
   1453 
   1454     PA_STATE& GetPA()
   1455     {
   1456 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
   1457         if (cutPA)
   1458         {
   1459             return this->paCut;
   1460         }
   1461         else
   1462 #endif
   1463         {
   1464             return this->paOpt;
   1465         }
   1466     }
   1467 
   1468     PA_STATE_OPT paOpt;
   1469     PA_STATE_CUT paCut;
   1470 
   1471     bool cutPA{ false };
   1472 
   1473     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
   1474 
   1475     PA_STATE::SIMDMASK      indexStore[MAX_NUM_VERTS_PER_PRIM];
   1476 };
   1477