Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file pa.h
     24 *
     25 * @brief Definitions for primitive assembly.
     26 *        N primitives are assembled at a time, where N is the SIMD width.
     27 *        A state machine, that is specific for a given topology, drives the
     28 *        assembly of vertices into triangles.
     29 *
     30 ******************************************************************************/
     31 #pragma once
     32 
     33 #include "frontend.h"
     34 
     35 struct PA_STATE
     36 {
     37     DRAW_CONTEXT *pDC{ nullptr };              // draw context
     38     uint8_t* pStreamBase{ nullptr };           // vertex stream
     39     uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
     40 
     41     // The topology the binner will use. In some cases the FE changes the topology from the api state.
     42     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
     43 
     44     PA_STATE() {}
     45     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
     46         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
     47 
     48     virtual bool HasWork() = 0;
     49     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
     50     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
     51     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
     52     virtual bool NextPrim() = 0;
     53     virtual simdvertex& GetNextVsOutput() = 0;
     54     virtual bool GetNextStreamOutput() = 0;
     55     virtual simdmask& GetNextVsIndices() = 0;
     56     virtual uint32_t NumPrims() = 0;
     57     virtual void Reset() = 0;
     58     virtual simdscalari GetPrimID(uint32_t startID) = 0;
     59 };
     60 
     61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
     62 // output. Here is the sequence
     63 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
     64 //    2. Execute PA function to assemble and bin triangles.
     65 //        a.    The PA function is a set of functions that collectively make up the
     66 //            state machine for a given topology.
     67 //                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
     69 //                1.    We call this the current and previous simd vertex.
     70 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
     71 //                    order to assemble the second triangle, for a triangle list, we'll need the
     72 //                    last vertex from the previous simd and the first 2 vertices from the current simd.
     73 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
     74 //
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws without
// cuts
     77 struct PA_STATE_OPT : public PA_STATE
     78 {
     79     simdvertex leadingVertex;            // For tri-fan
     80     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
     81     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
     82 
     83     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
     84 
     85     uint32_t cur{ 0 };                   // index to current VS output.
     86     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
     87     uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
     88 
     89     uint32_t counter{ 0 };               // state counter
     90     bool reset{ false };                 // reset state
     91 
     92     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
     93     simdscalari primID;
     94 
     95     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
     96     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
     97 
     98     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
     99     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    100     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
    101 
    102     // state used to advance the PA when Next is called
    103     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
    104     uint32_t           nextNumSimdPrims{ 0 };
    105     uint32_t           nextNumPrimsIncrement{ 0 };
    106     bool               nextReset{ false };
    107     bool               isStreaming{ false };
    108 
    109     simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
    110 
    111     PA_STATE_OPT() {}
    112     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
    113         bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
    114 
    115     bool HasWork()
    116     {
    117         return (this->numPrimsComplete < this->numPrims) ? true : false;
    118     }
    119 
    120     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    121     {
    122         simdvertex* pVertex = (simdvertex*)pStreamBase;
    123         return pVertex[index].attrib[slot];
    124     }
    125 
    126     // Assembles 4 triangles. Each simdvector is a single vertex from 4
    127     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    128     bool Assemble(uint32_t slot, simdvector verts[])
    129     {
    130         return this->pfnPaFunc(*this, slot, verts);
    131     }
    132 
    133     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    134     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    135     {
    136         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    137     }
    138 
    139     bool NextPrim()
    140     {
    141         this->pfnPaFunc = this->pfnPaNextFunc;
    142         this->numSimdPrims = this->nextNumSimdPrims;
    143         this->numPrimsComplete += this->nextNumPrimsIncrement;
    144         this->reset = this->nextReset;
    145 
    146         if (this->isStreaming)
    147         {
    148             this->reset = false;
    149         }
    150 
    151         bool morePrims = false;
    152 
    153         if (this->numSimdPrims > 0)
    154         {
    155             morePrims = true;
    156             this->numSimdPrims--;
    157         }
    158         else
    159         {
    160             this->counter = (this->reset) ? 0 : (this->counter + 1);
    161             this->reset = false;
    162         }
    163 
    164         this->pfnPaFunc = this->pfnPaNextFunc;
    165 
    166         if (!HasWork())
    167         {
    168             morePrims = false;    // no more to do
    169         }
    170 
    171         return morePrims;
    172     }
    173 
    174     simdvertex& GetNextVsOutput()
    175     {
    176         // increment cur and prev indices
    177         const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
    178         this->prev = this->cur;  // prev is undefined for first state.
    179         this->cur = this->counter % numSimdVerts;
    180 
    181         simdvertex* pVertex = (simdvertex*)pStreamBase;
    182         return pVertex[this->cur];
    183     }
    184 
    185     simdmask& GetNextVsIndices()
    186     {
    187         // unused in optimized PA, pass tmp buffer back
    188         return tmpIndices;
    189     }
    190 
    191     bool GetNextStreamOutput()
    192     {
    193         this->prev = this->cur;
    194         this->cur = this->counter;
    195 
    196         return HasWork();
    197     }
    198 
    199     uint32_t NumPrims()
    200     {
    201         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
    202             (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
    203     }
    204 
    205     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    206         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    207         uint32_t numSimdPrims = 0,
    208         uint32_t numPrimsIncrement = 0,
    209         bool reset = false)
    210     {
    211         this->pfnPaNextFunc = pfnPaNextFunc;
    212         this->nextNumSimdPrims = numSimdPrims;
    213         this->nextNumPrimsIncrement = numPrimsIncrement;
    214         this->nextReset = reset;
    215 
    216         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    217     }
    218 
    219     void Reset()
    220     {
    221         this->pfnPaFunc = this->pfnPaFuncReset;
    222         this->numPrimsComplete = 0;
    223         this->numSimdPrims = 0;
    224         this->cur = 0;
    225         this->prev = 0;
    226         this->first = 0;
    227         this->counter = 0;
    228         this->reset = false;
    229     }
    230 
    231     simdscalari GetPrimID(uint32_t startID)
    232     {
    233         return _simd_add_epi32(this->primID,
    234             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
    235     }
    236 };
    237 
    238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
    239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    240     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    241     uint32_t numSimdPrims = 0,
    242     uint32_t numPrimsIncrement = 0,
    243     bool reset = false)
    244 {
    245     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
    246 }
    247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
    248 {
    249     return pa.GetSimdVector(index, slot);
    250 }
    251 
    252 INLINE __m128 swizzleLane0(const simdvector &a)
    253 {
    254     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
    255     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
    256     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
    257 }
    258 
    259 INLINE __m128 swizzleLane1(const simdvector &a)
    260 {
    261     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
    262     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
    263     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
    264 }
    265 
    266 INLINE __m128 swizzleLane2(const simdvector &a)
    267 {
    268     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
    269     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
    270     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
    271 }
    272 
    273 INLINE __m128 swizzleLane3(const simdvector &a)
    274 {
    275     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
    276     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
    277     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
    278 }
    279 
    280 INLINE __m128 swizzleLane4(const simdvector &a)
    281 {
    282     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
    283     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
    284     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
    285 
    286 }
    287 
    288 INLINE __m128 swizzleLane5(const simdvector &a)
    289 {
    290     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
    291     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
    292     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
    293 }
    294 
    295 INLINE __m128 swizzleLane6(const simdvector &a)
    296 {
    297     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
    298     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
    299     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
    300 }
    301 
    302 INLINE __m128 swizzleLane7(const simdvector &a)
    303 {
    304     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
    305     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
    306     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
    307 }
    308 
    309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
    310 {
    311     switch (lane) {
    312     case 0:
    313         return swizzleLane0(a);
    314     case 1:
    315         return swizzleLane1(a);
    316     case 2:
    317         return swizzleLane2(a);
    318     case 3:
    319         return swizzleLane3(a);
    320     case 4:
    321         return swizzleLane4(a);
    322     case 5:
    323         return swizzleLane5(a);
    324     case 6:
    325         return swizzleLane6(a);
    326     case 7:
    327         return swizzleLane7(a);
    328     default:
    329         return _mm_setzero_ps();
    330     }
    331 }
    332 
    333 // Cut-aware primitive assembler.
    334 struct PA_STATE_CUT : public PA_STATE
    335 {
    336     simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
    337     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
    338     uint32_t numAttribs{ 0 };            // number of attributes
    339     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
    340     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
    341     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
    342     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
    343     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
    344     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
    345     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
    346     uint32_t curVertex{ 0 };             // current unprocessed vertex
    347     uint32_t startPrimId{ 0 };           // starting prim id
    348     simdscalari vPrimId;                 // vector of prim ID
    349     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
    350     uint32_t vertsPerPrim{ 0 };
    351     simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
    352     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
    353                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
    354                                          // while the GS sends valid verts for every index
    355     // Topology state tracking
    356     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    357     uint32_t curIndex{ 0 };
    358     bool reverseWinding{ false };        // indicates reverse winding for strips
    359     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
    360 
    361     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    362     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
    363 
    364     PA_STATE_CUT() {}
    365     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
    366         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
    367         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
    368     {
    369         numVerts = in_streamSizeInVerts;
    370         numAttribs = in_numAttribs;
    371         binTopology = topo;
    372         needOffsets = false;
    373         processCutVerts = in_processCutVerts;
    374 
    375         numVertsToAssemble = numRemainingVerts = in_numVerts;
    376         numPrimsAssembled = 0;
    377         headVertex = tailVertex = curVertex = 0;
    378 
    379         curIndex = 0;
    380         pCutIndices = in_pIndices;
    381         memset(indices, 0, sizeof(indices));
    382         vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    383         reverseWinding = false;
    384         adjExtraVert = -1;
    385 
    386         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
    387         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
    388 
    389         switch (topo)
    390         {
    391         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
    392         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
    393         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
    394         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
    395                                     {
    396                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
    397                                     }
    398                                     else
    399                                     {
    400                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
    401                                     }
    402                                     break;
    403 
    404         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
    405         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
    406         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
    407         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
    408         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
    409         default: assert(0 && "Unimplemented topology");
    410         }
    411     }
    412 
    413     simdvertex& GetNextVsOutput()
    414     {
    415         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
    416         this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
    417         this->needOffsets = true;
    418         return ((simdvertex*)pStreamBase)[vertexIndex];
    419     }
    420 
    421     simdmask& GetNextVsIndices()
    422     {
    423         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
    424         simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
    425         return *pCurCutIndex;
    426     }
    427 
    428     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    429     {
    430         // unused
    431         SWR_ASSERT(0 && "Not implemented");
    432         return this->tmpVertex.attrib[0];
    433     }
    434 
    435     bool GetNextStreamOutput()
    436     {
    437         this->headVertex += KNOB_SIMD_WIDTH;
    438         this->needOffsets = true;
    439         return HasWork();
    440     }
    441 
    442     simdscalari GetPrimID(uint32_t startID)
    443     {
    444         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
    445     }
    446 
    447     void Reset()
    448     {
    449         this->numRemainingVerts = this->numVertsToAssemble;
    450         this->numPrimsAssembled = 0;
    451         this->curIndex = 0;
    452         this->curVertex = 0;
    453         this->tailVertex = 0;
    454         this->headVertex = 0;
    455         this->reverseWinding = false;
    456         this->adjExtraVert = -1;
    457         this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    458     }
    459 
    460     bool HasWork()
    461     {
    462         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    463     }
    464 
    465     bool IsVertexStoreFull()
    466     {
    467         return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    468     }
    469 
    470     void RestartTopology()
    471     {
    472         this->curIndex = 0;
    473         this->reverseWinding = false;
    474         this->adjExtraVert = -1;
    475     }
    476 
    477     bool IsCutIndex(uint32_t vertex)
    478     {
    479         uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
    480         uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
    481         return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
    482     }
    483 
    484     // iterates across the unprocessed verts until we hit the end or we
    485     // have assembled SIMD prims
    486     void ProcessVerts()
    487     {
    488         while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
    489             this->numRemainingVerts > 0 &&
    490             this->curVertex != this->headVertex)
    491         {
    492             // if cut index, restart topology
    493             if (IsCutIndex(this->curVertex))
    494             {
    495                 if (this->processCutVerts)
    496                 {
    497                     (this->*pfnPa)(this->curVertex, false);
    498                 }
    499                 // finish off tri strip w/ adj before restarting topo
    500                 if (this->adjExtraVert != -1)
    501                 {
    502                     (this->*pfnPa)(this->curVertex, true);
    503                 }
    504                 RestartTopology();
    505             }
    506             else
    507             {
    508                 (this->*pfnPa)(this->curVertex, false);
    509             }
    510 
    511             this->curVertex++;
    512             if (this->curVertex >= this->numVerts) {
    513                this->curVertex = 0;
    514             }
    515             this->numRemainingVerts--;
    516         }
    517 
    518         // special case last primitive for tri strip w/ adj
    519         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
    520         {
    521             (this->*pfnPa)(this->curVertex, true);
    522         }
    523     }
    524 
    525     void Advance()
    526     {
    527         // done with current batch
    528         // advance tail to the current unsubmitted vertex
    529         this->tailVertex = this->curVertex;
    530         this->numPrimsAssembled = 0;
    531         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
    532     }
    533 
    534     bool NextPrim()
    535     {
    536         // if we've assembled enough prims, we can advance to the next set of verts
    537         if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
    538         {
    539             Advance();
    540         }
    541         return false;
    542     }
    543 
    544     void ComputeOffsets()
    545     {
    546         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    547         {
    548             simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
    549 
    550             // step to simdvertex batch
    551             const uint32_t simdShift = 3; // @todo make knob
    552             simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
    553             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
    554 
    555             // step to index
    556             const uint32_t simdMask = 0x7; // @todo make knob
    557             simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
    558             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
    559         }
    560     }
    561 
    562     bool Assemble(uint32_t slot, simdvector result[])
    563     {
    564         // process any outstanding verts
    565         ProcessVerts();
    566 
    567         // return false if we don't have enough prims assembled
    568         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
    569         {
    570             return false;
    571         }
    572 
    573         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    574         if (this->needOffsets)
    575         {
    576             ComputeOffsets();
    577             this->needOffsets = false;
    578         }
    579 
    580         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    581         {
    582             simdscalari offsets = this->vOffsets[v];
    583 
    584             // step to attribute
    585             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
    586 
    587             float* pBase = (float*)this->pStreamBase;
    588             for (uint32_t c = 0; c < 4; ++c)
    589             {
    590                 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
    591 
    592                 // move base to next component
    593                 pBase += KNOB_SIMD_WIDTH;
    594             }
    595         }
    596 
    597         return true;
    598     }
    599 
    600     void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    601     {
    602         // move to slot
    603         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    604         {
    605             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
    606             uint32_t offset = pOffset[triIndex];
    607             offset += sizeof(simdvector) * slot;
    608             float* pVert = (float*)&tri[v];
    609             for (uint32_t c = 0; c < 4; ++c)
    610             {
    611                 float* pComponent = (float*)(this->pStreamBase + offset);
    612                 pVert[c] = *pComponent;
    613                 offset += KNOB_SIMD_WIDTH * sizeof(float);
    614             }
    615         }
    616     }
    617 
    618     uint32_t NumPrims()
    619     {
    620         return this->numPrimsAssembled;
    621     }
    622 
    623     // Per-topology functions
    624     void ProcessVertTriStrip(uint32_t index, bool finish)
    625     {
    626         this->vert[this->curIndex] = index;
    627         this->curIndex++;
    628         if (this->curIndex == 3)
    629         {
    630             // assembled enough verts for prim, add to gather indices
    631             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    632             if (reverseWinding)
    633             {
    634                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
    635                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
    636             }
    637             else
    638             {
    639                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
    640                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
    641             }
    642 
    643             // increment numPrimsAssembled
    644             this->numPrimsAssembled++;
    645 
    646             // set up next prim state
    647             this->vert[0] = this->vert[1];
    648             this->vert[1] = this->vert[2];
    649             this->curIndex = 2;
    650             this->reverseWinding ^= 1;
    651         }
    652     }
    653 
    654     template<bool gsEnabled>
    655     void AssembleTriStripAdj()
    656     {
    657         if (!gsEnabled)
    658         {
    659             this->vert[1] = this->vert[2];
    660             this->vert[2] = this->vert[4];
    661 
    662             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    663             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    664             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    665 
    666             this->vert[4] = this->vert[2];
    667             this->vert[2] = this->vert[1];
    668         }
    669         else
    670         {
    671             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    672             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    673             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    674             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    675             this->indices[4][this->numPrimsAssembled] = this->vert[4];
    676             this->indices[5][this->numPrimsAssembled] = this->vert[5];
    677         }
    678         this->numPrimsAssembled++;
    679     }
    680 
    681 
    682     template<bool gsEnabled>
    683     void ProcessVertTriStripAdj(uint32_t index, bool finish)
    684     {
    685         // handle last primitive of tristrip
    686         if (finish && this->adjExtraVert != -1)
    687         {
    688             this->vert[3] = this->adjExtraVert;
    689             AssembleTriStripAdj<gsEnabled>();
    690             this->adjExtraVert = -1;
    691             return;
    692         }
    693 
    694         switch (this->curIndex)
    695         {
    696         case 0:
    697         case 1:
    698         case 2:
    699         case 4:
    700             this->vert[this->curIndex] = index;
    701             this->curIndex++;
    702             break;
    703         case 3:
    704             this->vert[5] = index;
    705             this->curIndex++;
    706             break;
    707         case 5:
    708             if (this->adjExtraVert == -1)
    709             {
    710                 this->adjExtraVert = index;
    711             }
    712             else
    713             {
    714                 this->vert[3] = index;
    715                 if (!gsEnabled)
    716                 {
    717                     AssembleTriStripAdj<gsEnabled>();
    718 
    719                     uint32_t nextTri[6];
    720                     if (this->reverseWinding)
    721                     {
    722                         nextTri[0] = this->vert[4];
    723                         nextTri[1] = this->vert[0];
    724                         nextTri[2] = this->vert[2];
    725                         nextTri[4] = this->vert[3];
    726                         nextTri[5] = this->adjExtraVert;
    727                     }
    728                     else
    729                     {
    730                         nextTri[0] = this->vert[2];
    731                         nextTri[1] = this->adjExtraVert;
    732                         nextTri[2] = this->vert[3];
    733                         nextTri[4] = this->vert[4];
    734                         nextTri[5] = this->vert[0];
    735                     }
    736                     for (uint32_t i = 0; i < 6; ++i)
    737                     {
    738                         this->vert[i] = nextTri[i];
    739                     }
    740 
    741                     this->adjExtraVert = -1;
    742                     this->reverseWinding ^= 1;
    743                 }
    744                 else
    745                 {
    746                     this->curIndex++;
    747                 }
    748             }
    749             break;
    750         case 6:
    751             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
    752             AssembleTriStripAdj<gsEnabled>();
    753 
    754             uint32_t nextTri[6];
    755             if (this->reverseWinding)
    756             {
    757                 nextTri[0] = this->vert[4];
    758                 nextTri[1] = this->vert[0];
    759                 nextTri[2] = this->vert[2];
    760                 nextTri[4] = this->vert[3];
    761                 nextTri[5] = this->adjExtraVert;
    762             }
    763             else
    764             {
    765                 nextTri[0] = this->vert[2];
    766                 nextTri[1] = this->adjExtraVert;
    767                 nextTri[2] = this->vert[3];
    768                 nextTri[4] = this->vert[4];
    769                 nextTri[5] = this->vert[0];
    770             }
    771             for (uint32_t i = 0; i < 6; ++i)
    772             {
    773                 this->vert[i] = nextTri[i];
    774             }
    775             this->reverseWinding ^= 1;
    776             this->adjExtraVert = index;
    777             this->curIndex--;
    778             break;
    779         }
    780     }
    781 
    782     void ProcessVertTriList(uint32_t index, bool finish)
    783     {
    784         this->vert[this->curIndex] = index;
    785         this->curIndex++;
    786         if (this->curIndex == 3)
    787         {
    788             // assembled enough verts for prim, add to gather indices
    789             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    790             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    791             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    792 
    793             // increment numPrimsAssembled
    794             this->numPrimsAssembled++;
    795 
    796             // set up next prim state
    797             this->curIndex = 0;
    798         }
    799     }
    800 
    801     void ProcessVertTriListAdj(uint32_t index, bool finish)
    802     {
    803         this->vert[this->curIndex] = index;
    804         this->curIndex++;
    805         if (this->curIndex == 6)
    806         {
    807             // assembled enough verts for prim, add to gather indices
    808             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    809             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    810             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    811             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    812             this->indices[4][this->numPrimsAssembled] = this->vert[4];
    813             this->indices[5][this->numPrimsAssembled] = this->vert[5];
    814 
    815             // increment numPrimsAssembled
    816             this->numPrimsAssembled++;
    817 
    818             // set up next prim state
    819             this->curIndex = 0;
    820         }
    821     }
    822 
    823     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    824     {
    825         this->vert[this->curIndex] = index;
    826         this->curIndex++;
    827         if (this->curIndex == 6)
    828         {
    829             // assembled enough verts for prim, add to gather indices
    830             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    831             this->indices[1][this->numPrimsAssembled] = this->vert[2];
    832             this->indices[2][this->numPrimsAssembled] = this->vert[4];
    833 
    834             // increment numPrimsAssembled
    835             this->numPrimsAssembled++;
    836 
    837             // set up next prim state
    838             this->curIndex = 0;
    839         }
    840     }
    841 
    842 
    843     void ProcessVertLineList(uint32_t index, bool finish)
    844     {
    845         this->vert[this->curIndex] = index;
    846         this->curIndex++;
    847         if (this->curIndex == 2)
    848         {
    849             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    850             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    851 
    852             this->numPrimsAssembled++;
    853             this->curIndex = 0;
    854         }
    855     }
    856 
    857     void ProcessVertLineStrip(uint32_t index, bool finish)
    858     {
    859         this->vert[this->curIndex] = index;
    860         this->curIndex++;
    861         if (this->curIndex == 2)
    862         {
    863             // assembled enough verts for prim, add to gather indices
    864             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    865             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    866 
    867             // increment numPrimsAssembled
    868             this->numPrimsAssembled++;
    869 
    870             // set up next prim state
    871             this->vert[0] = this->vert[1];
    872             this->curIndex = 1;
    873         }
    874     }
    875 
    876     void ProcessVertLineStripAdj(uint32_t index, bool finish)
    877     {
    878         this->vert[this->curIndex] = index;
    879         this->curIndex++;
    880         if (this->curIndex == 4)
    881         {
    882             // assembled enough verts for prim, add to gather indices
    883             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    884             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    885             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    886             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    887 
    888             // increment numPrimsAssembled
    889             this->numPrimsAssembled++;
    890 
    891             // set up next prim state
    892             this->vert[0] = this->vert[1];
    893             this->vert[1] = this->vert[2];
    894             this->vert[2] = this->vert[3];
    895             this->curIndex = 3;
    896         }
    897     }
    898 
    899     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    900     {
    901         this->vert[this->curIndex] = index;
    902         this->curIndex++;
    903         if (this->curIndex == 4)
    904         {
    905             // assembled enough verts for prim, add to gather indices
    906             this->indices[0][this->numPrimsAssembled] = this->vert[1];
    907             this->indices[1][this->numPrimsAssembled] = this->vert[2];
    908 
    909             // increment numPrimsAssembled
    910             this->numPrimsAssembled++;
    911 
    912             // set up next prim state
    913             this->vert[0] = this->vert[1];
    914             this->vert[1] = this->vert[2];
    915             this->vert[2] = this->vert[3];
    916             this->curIndex = 3;
    917         }
    918     }
    919 
    920     void ProcessVertLineListAdj(uint32_t index, bool finish)
    921     {
    922         this->vert[this->curIndex] = index;
    923         this->curIndex++;
    924         if (this->curIndex == 4)
    925         {
    926             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    927             this->indices[1][this->numPrimsAssembled] = this->vert[1];
    928             this->indices[2][this->numPrimsAssembled] = this->vert[2];
    929             this->indices[3][this->numPrimsAssembled] = this->vert[3];
    930 
    931             this->numPrimsAssembled++;
    932             this->curIndex = 0;
    933         }
    934     }
    935 
    936     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    937     {
    938         this->vert[this->curIndex] = index;
    939         this->curIndex++;
    940         if (this->curIndex == 4)
    941         {
    942             this->indices[0][this->numPrimsAssembled] = this->vert[1];
    943             this->indices[1][this->numPrimsAssembled] = this->vert[2];
    944 
    945             this->numPrimsAssembled++;
    946             this->curIndex = 0;
    947         }
    948     }
    949 
    950     void ProcessVertPointList(uint32_t index, bool finish)
    951     {
    952         this->vert[this->curIndex] = index;
    953         this->curIndex++;
    954         if (this->curIndex == 1)
    955         {
    956             this->indices[0][this->numPrimsAssembled] = this->vert[0];
    957             this->numPrimsAssembled++;
    958             this->curIndex = 0;
    959         }
    960     }
    961 };
    962 
    963 // Primitive Assembly for data output from the DomainShader.
    964 struct PA_TESS : PA_STATE
    965 {
    966     PA_TESS(
    967         DRAW_CONTEXT *in_pDC,
    968         const simdscalar* in_pVertData,
    969         uint32_t in_attributeStrideInVectors,
    970         uint32_t in_numAttributes,
    971         uint32_t* (&in_ppIndices)[3],
    972         uint32_t in_numPrims,
    973         PRIMITIVE_TOPOLOGY in_binTopology) :
    974 
    975         PA_STATE(in_pDC, nullptr, 0),
    976         m_pVertexData(in_pVertData),
    977         m_attributeStrideInVectors(in_attributeStrideInVectors),
    978         m_numAttributes(in_numAttributes),
    979         m_numPrims(in_numPrims)
    980     {
    981         m_vPrimId = _simd_setzero_si();
    982         binTopology = in_binTopology;
    983         m_ppIndices[0] = in_ppIndices[0];
    984         m_ppIndices[1] = in_ppIndices[1];
    985         m_ppIndices[2] = in_ppIndices[2];
    986 
    987         switch (binTopology)
    988         {
    989         case TOP_POINT_LIST:
    990             m_numVertsPerPrim = 1;
    991             break;
    992 
    993         case TOP_LINE_LIST:
    994             m_numVertsPerPrim = 2;
    995             break;
    996 
    997         case TOP_TRIANGLE_LIST:
    998             m_numVertsPerPrim = 3;
    999             break;
   1000 
   1001         default:
   1002             SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
   1003             break;
   1004         }
   1005     }
   1006 
   1007     bool HasWork()
   1008     {
   1009         return m_numPrims != 0;
   1010     }
   1011 
   1012     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
   1013     {
   1014         SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
   1015         static simdvector junk;
   1016         return junk;
   1017     }
   1018 
   1019     static simdscalari GenPrimMask(uint32_t numPrims)
   1020     {
   1021         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
   1022 #if KNOB_SIMD_WIDTH == 8
   1023         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
   1024         {
   1025             -1, -1, -1, -1, -1, -1, -1, -1,
   1026              0,  0,  0,  0,  0,  0,  0,  0
   1027         };
   1028 #elif KNOB_SIMD_WIDTH == 16
   1029         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
   1030         {
   1031             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
   1032              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
   1033         };
   1034 #else
   1035 #error "Help, help, I can't get up!"
   1036 #endif
   1037 
   1038         return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
   1039     }
   1040 
   1041     bool Assemble(uint32_t slot, simdvector verts[])
   1042     {
   1043         static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
   1044         SWR_ASSERT(slot < m_numAttributes);
   1045 
   1046         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
   1047         if (0 == numPrimsToAssemble)
   1048         {
   1049             return false;
   1050         }
   1051 
   1052         simdscalari mask = GenPrimMask(numPrimsToAssemble);
   1053 
   1054         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
   1055         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
   1056         {
   1057             simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
   1058 
   1059             const float* pBase = pBaseAttrib;
   1060             for (uint32_t c = 0; c < 4; ++c)
   1061             {
   1062                 verts[i].v[c] = _simd_mask_i32gather_ps(
   1063                     _simd_setzero_ps(),
   1064                     pBase,
   1065                     indices,
   1066                     _simd_castsi_ps(mask),
   1067                     4 /* gcc doesn't like sizeof(float) */);
   1068                 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
   1069             }
   1070         }
   1071 
   1072         return true;
   1073     }
   1074 
   1075     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
   1076     {
   1077         SWR_ASSERT(slot < m_numAttributes);
   1078         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
   1079 
   1080         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
   1081         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
   1082         {
   1083             uint32_t index = m_ppIndices[i][primIndex];
   1084             const float* pVertData = pVertDataBase;
   1085             float* pVert = (float*)&verts[i];
   1086 
   1087             for (uint32_t c = 0; c < 4; ++c)
   1088             {
   1089                 pVert[c] = pVertData[index];
   1090                 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
   1091             }
   1092         }
   1093     }
   1094 
   1095     bool NextPrim()
   1096     {
   1097         uint32_t numPrims = PA_TESS::NumPrims();
   1098         m_numPrims -= numPrims;
   1099         m_ppIndices[0] += numPrims;
   1100         m_ppIndices[1] += numPrims;
   1101         m_ppIndices[2] += numPrims;
   1102 
   1103         return HasWork();
   1104     }
   1105 
   1106     simdvertex& GetNextVsOutput()
   1107     {
   1108         SWR_ASSERT(0, "%s", __FUNCTION__);
   1109         static simdvertex junk;
   1110         return junk;
   1111     }
   1112 
   1113     bool GetNextStreamOutput()
   1114     {
   1115         SWR_ASSERT(0, "%s", __FUNCTION__);
   1116         return false;
   1117     }
   1118 
   1119     simdmask& GetNextVsIndices()
   1120     {
   1121         SWR_ASSERT(0, "%s", __FUNCTION__);
   1122         static simdmask junk;
   1123         return junk;
   1124     }
   1125 
   1126     uint32_t NumPrims()
   1127     {
   1128         return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
   1129     }
   1130 
   1131     void Reset() { SWR_ASSERT(0); };
   1132 
   1133     simdscalari GetPrimID(uint32_t startID)
   1134     {
   1135         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
   1136     }
   1137 
   1138 private:
   1139     const simdscalar*   m_pVertexData = nullptr;
   1140     uint32_t            m_attributeStrideInVectors = 0;
   1141     uint32_t            m_numAttributes = 0;
   1142     uint32_t            m_numPrims = 0;
   1143     uint32_t*           m_ppIndices[3];
   1144 
   1145     uint32_t            m_numVertsPerPrim = 0;
   1146 
   1147     simdscalari         m_vPrimId;
   1148 };
   1149 
   1150 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
   1151 // based on state.
   1152 template <typename IsIndexedT, typename IsCutIndexEnabledT>
   1153 struct PA_FACTORY
   1154 {
   1155     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
   1156     {
   1157 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
   1158         const API_STATE& state = GetApiState(pDC);
   1159         if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
   1160             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
   1161             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
   1162             topo == TOP_TRIANGLE_LIST)) ||
   1163 
   1164             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
   1165             // for them in the optimized PA
   1166             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
   1167         {
   1168             memset(&indexStore, 0, sizeof(indexStore));
   1169             uint32_t numAttribs = state.feNumAttributes;
   1170 
   1171             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
   1172                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
   1173             cutPA = true;
   1174         }
   1175         else
   1176 #endif
   1177         {
   1178             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
   1179             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
   1180             cutPA = false;
   1181         }
   1182 
   1183     }
   1184 
   1185     PA_STATE& GetPA()
   1186     {
   1187 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
   1188         if (cutPA)
   1189         {
   1190             return this->paCut;
   1191         }
   1192         else
   1193 #endif
   1194         {
   1195             return this->paOpt;
   1196         }
   1197     }
   1198 
   1199     PA_STATE_OPT paOpt;
   1200     PA_STATE_CUT paCut;
   1201     bool cutPA{ false };
   1202 
   1203     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
   1204 
   1205     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
   1206     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
   1207 };
   1208