Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file frontend.cpp
     24 *
     25 * @brief Implementation for Frontend which handles vertex processing,
     26 *        primitive assembly, clipping, binning, etc.
     27 *
     28 ******************************************************************************/
     29 
     30 #include "api.h"
     31 #include "frontend.h"
     32 #include "backend.h"
     33 #include "context.h"
     34 #include "rdtsc_core.h"
     35 #include "utils.h"
     36 #include "threads.h"
     37 #include "pa.h"
     38 #include "clip.h"
     39 #include "tilemgr.h"
     40 #include "tessellator.h"
     41 #include <limits>
     42 #include <iostream>
     43 
     44 //////////////////////////////////////////////////////////////////////////
     45 /// @brief Helper macro to generate a bitmask
     46 static INLINE uint32_t GenMask(uint32_t numBits)
     47 {
     48     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
     49     return ((1U << numBits) - 1);
     50 }
     51 
     52 //////////////////////////////////////////////////////////////////////////
     53 /// @brief FE handler for SwrSync.
     54 /// @param pContext - pointer to SWR context.
     55 /// @param pDC - pointer to draw context.
     56 /// @param workerId - thread's worker id. Even thread has a unique id.
     57 /// @param pUserData - Pointer to user data passed back to sync callback.
     58 /// @todo This should go away when we switch this to use compute threading.
     59 void ProcessSync(
     60     SWR_CONTEXT *pContext,
     61     DRAW_CONTEXT *pDC,
     62     uint32_t workerId,
     63     void *pUserData)
     64 {
     65     BE_WORK work;
     66     work.type = SYNC;
     67     work.pfnWork = ProcessSyncBE;
     68 
     69     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     70     pTileMgr->enqueue(0, 0, &work);
     71 }
     72 
     73 //////////////////////////////////////////////////////////////////////////
     74 /// @brief FE handler for SwrDestroyContext.
     75 /// @param pContext - pointer to SWR context.
     76 /// @param pDC - pointer to draw context.
     77 /// @param workerId - thread's worker id. Even thread has a unique id.
     78 /// @param pUserData - Pointer to user data passed back to sync callback.
     79 void ProcessShutdown(
     80     SWR_CONTEXT *pContext,
     81     DRAW_CONTEXT *pDC,
     82     uint32_t workerId,
     83     void *pUserData)
     84 {
     85     BE_WORK work;
     86     work.type = SHUTDOWN;
     87     work.pfnWork = ProcessShutdownBE;
     88 
     89     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     90     // Enqueue at least 1 work item for each worker thread
     91     // account for number of numa nodes
     92     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
     93 
     94     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
     95     {
     96         for (uint32_t n = 0; n < numNumaNodes; ++n)
     97         {
     98             pTileMgr->enqueue(i, n, &work);
     99         }
    100     }
    101 }
    102 
    103 //////////////////////////////////////////////////////////////////////////
    104 /// @brief FE handler for SwrClearRenderTarget.
    105 /// @param pContext - pointer to SWR context.
    106 /// @param pDC - pointer to draw context.
    107 /// @param workerId - thread's worker id. Even thread has a unique id.
    108 /// @param pUserData - Pointer to user data passed back to clear callback.
    109 /// @todo This should go away when we switch this to use compute threading.
    110 void ProcessClear(
    111     SWR_CONTEXT *pContext,
    112     DRAW_CONTEXT *pDC,
    113     uint32_t workerId,
    114     void *pUserData)
    115 {
    116     CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
    117     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    118 
    119     // queue a clear to each macro tile
    120     // compute macro tile bounds for the specified rect
    121     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    122     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    123     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    124     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    125 
    126     BE_WORK work;
    127     work.type = CLEAR;
    128     work.pfnWork = ProcessClearBE;
    129     work.desc.clear = *pDesc;
    130 
    131     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    132     {
    133         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    134         {
    135             pTileMgr->enqueue(x, y, &work);
    136         }
    137     }
    138 }
    139 
    140 //////////////////////////////////////////////////////////////////////////
    141 /// @brief FE handler for SwrStoreTiles.
    142 /// @param pContext - pointer to SWR context.
    143 /// @param pDC - pointer to draw context.
    144 /// @param workerId - thread's worker id. Even thread has a unique id.
    145 /// @param pUserData - Pointer to user data passed back to callback.
    146 /// @todo This should go away when we switch this to use compute threading.
    147 void ProcessStoreTiles(
    148     SWR_CONTEXT *pContext,
    149     DRAW_CONTEXT *pDC,
    150     uint32_t workerId,
    151     void *pUserData)
    152 {
    153     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
    154     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    155     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
    156 
    157     // queue a store to each macro tile
    158     // compute macro tile bounds for the specified rect
    159     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    160     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    161     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    162     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    163 
    164     // store tiles
    165     BE_WORK work;
    166     work.type = STORETILES;
    167     work.pfnWork = ProcessStoreTilesBE;
    168     work.desc.storeTiles = *pDesc;
    169 
    170     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    171     {
    172         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    173         {
    174             pTileMgr->enqueue(x, y, &work);
    175         }
    176     }
    177 
    178     AR_END(FEProcessStoreTiles, 0);
    179 }
    180 
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default bounds: only macrotiles fully covered by the rect (min rounded
    // up, max rounded down and made inclusive by the -1).
    // NOTE(review): if rect.xmax < KNOB_MACROTILE_X_DIM, the -1 underflows the
    // unsigned max and the clamps below do not recover it — presumably callers
    // always pass rects at least one macrotile wide when fullTilesOnly is set;
    // confirm against callers.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // NOTE(review): clamps to the tile count rather than count - 1, matching
    // the <= asserts above; verify the hot tile arrays really have an extra
    // slot if this bound can be reached.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    // Queue one discard/invalidate work item per macrotile in the range.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
    235 
    236 //////////////////////////////////////////////////////////////////////////
    237 /// @brief Computes the number of primitives given the number of verts.
    238 /// @param mode - primitive topology for draw operation.
    239 /// @param numPrims - number of vertices or indices for draw.
    240 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
    241 uint32_t GetNumPrims(
    242     PRIMITIVE_TOPOLOGY mode,
    243     uint32_t numPrims)
    244 {
    245     switch (mode)
    246     {
    247     case TOP_POINT_LIST: return numPrims;
    248     case TOP_TRIANGLE_LIST: return numPrims / 3;
    249     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    250     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    251     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    252     case TOP_QUAD_LIST: return numPrims / 4;
    253     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    254     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    255     case TOP_LINE_LIST: return numPrims / 2;
    256     case TOP_LINE_LOOP: return numPrims;
    257     case TOP_RECT_LIST: return numPrims / 3;
    258     case TOP_LINE_LIST_ADJ: return numPrims / 4;
    259     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    260     case TOP_TRI_LIST_ADJ: return numPrims / 6;
    261     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
    262 
    263     case TOP_PATCHLIST_1:
    264     case TOP_PATCHLIST_2:
    265     case TOP_PATCHLIST_3:
    266     case TOP_PATCHLIST_4:
    267     case TOP_PATCHLIST_5:
    268     case TOP_PATCHLIST_6:
    269     case TOP_PATCHLIST_7:
    270     case TOP_PATCHLIST_8:
    271     case TOP_PATCHLIST_9:
    272     case TOP_PATCHLIST_10:
    273     case TOP_PATCHLIST_11:
    274     case TOP_PATCHLIST_12:
    275     case TOP_PATCHLIST_13:
    276     case TOP_PATCHLIST_14:
    277     case TOP_PATCHLIST_15:
    278     case TOP_PATCHLIST_16:
    279     case TOP_PATCHLIST_17:
    280     case TOP_PATCHLIST_18:
    281     case TOP_PATCHLIST_19:
    282     case TOP_PATCHLIST_20:
    283     case TOP_PATCHLIST_21:
    284     case TOP_PATCHLIST_22:
    285     case TOP_PATCHLIST_23:
    286     case TOP_PATCHLIST_24:
    287     case TOP_PATCHLIST_25:
    288     case TOP_PATCHLIST_26:
    289     case TOP_PATCHLIST_27:
    290     case TOP_PATCHLIST_28:
    291     case TOP_PATCHLIST_29:
    292     case TOP_PATCHLIST_30:
    293     case TOP_PATCHLIST_31:
    294     case TOP_PATCHLIST_32:
    295         return numPrims / (mode - TOP_PATCHLIST_BASE);
    296 
    297     case TOP_POLYGON:
    298     case TOP_POINT_LIST_BF:
    299     case TOP_LINE_STRIP_CONT:
    300     case TOP_LINE_STRIP_BF:
    301     case TOP_LINE_STRIP_CONT_BF:
    302     case TOP_TRIANGLE_FAN_NOSTIPPLE:
    303     case TOP_TRI_STRIP_REVERSE:
    304     case TOP_PATCHLIST_BASE:
    305     case TOP_UNKNOWN:
    306         SWR_INVALID("Unsupported topology: %d", mode);
    307         return 0;
    308     }
    309 
    310     return 0;
    311 }
    312 
    313 //////////////////////////////////////////////////////////////////////////
    314 /// @brief Computes the number of verts given the number of primitives.
    315 /// @param mode - primitive topology for draw operation.
    316 /// @param numPrims - number of primitives for draw.
    317 uint32_t GetNumVerts(
    318     PRIMITIVE_TOPOLOGY mode,
    319     uint32_t numPrims)
    320 {
    321     switch (mode)
    322     {
    323     case TOP_POINT_LIST: return numPrims;
    324     case TOP_TRIANGLE_LIST: return numPrims * 3;
    325     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    326     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    327     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    328     case TOP_QUAD_LIST: return numPrims * 4;
    329     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    330     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    331     case TOP_LINE_LIST: return numPrims * 2;
    332     case TOP_LINE_LOOP: return numPrims;
    333     case TOP_RECT_LIST: return numPrims * 3;
    334     case TOP_LINE_LIST_ADJ: return numPrims * 4;
    335     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    336     case TOP_TRI_LIST_ADJ: return numPrims * 6;
    337     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
    338 
    339     case TOP_PATCHLIST_1:
    340     case TOP_PATCHLIST_2:
    341     case TOP_PATCHLIST_3:
    342     case TOP_PATCHLIST_4:
    343     case TOP_PATCHLIST_5:
    344     case TOP_PATCHLIST_6:
    345     case TOP_PATCHLIST_7:
    346     case TOP_PATCHLIST_8:
    347     case TOP_PATCHLIST_9:
    348     case TOP_PATCHLIST_10:
    349     case TOP_PATCHLIST_11:
    350     case TOP_PATCHLIST_12:
    351     case TOP_PATCHLIST_13:
    352     case TOP_PATCHLIST_14:
    353     case TOP_PATCHLIST_15:
    354     case TOP_PATCHLIST_16:
    355     case TOP_PATCHLIST_17:
    356     case TOP_PATCHLIST_18:
    357     case TOP_PATCHLIST_19:
    358     case TOP_PATCHLIST_20:
    359     case TOP_PATCHLIST_21:
    360     case TOP_PATCHLIST_22:
    361     case TOP_PATCHLIST_23:
    362     case TOP_PATCHLIST_24:
    363     case TOP_PATCHLIST_25:
    364     case TOP_PATCHLIST_26:
    365     case TOP_PATCHLIST_27:
    366     case TOP_PATCHLIST_28:
    367     case TOP_PATCHLIST_29:
    368     case TOP_PATCHLIST_30:
    369     case TOP_PATCHLIST_31:
    370     case TOP_PATCHLIST_32:
    371         return numPrims * (mode - TOP_PATCHLIST_BASE);
    372 
    373     case TOP_POLYGON:
    374     case TOP_POINT_LIST_BF:
    375     case TOP_LINE_STRIP_CONT:
    376     case TOP_LINE_STRIP_BF:
    377     case TOP_LINE_STRIP_CONT_BF:
    378     case TOP_TRIANGLE_FAN_NOSTIPPLE:
    379     case TOP_TRI_STRIP_REVERSE:
    380     case TOP_PATCHLIST_BASE:
    381     case TOP_UNKNOWN:
    382         SWR_INVALID("Unsupported topology: %d", mode);
    383         return 0;
    384     }
    385 
    386     return 0;
    387 }
    388 
    389 //////////////////////////////////////////////////////////////////////////
    390 /// @brief Return number of verts per primitive.
    391 /// @param topology - topology
    392 /// @param includeAdjVerts - include adjacent verts in primitive vertices
    393 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
    394 {
    395     uint32_t numVerts = 0;
    396     switch (topology)
    397     {
    398     case TOP_POINT_LIST:
    399     case TOP_POINT_LIST_BF:
    400         numVerts = 1;
    401         break;
    402     case TOP_LINE_LIST:
    403     case TOP_LINE_STRIP:
    404     case TOP_LINE_LIST_ADJ:
    405     case TOP_LINE_LOOP:
    406     case TOP_LINE_STRIP_CONT:
    407     case TOP_LINE_STRIP_BF:
    408     case TOP_LISTSTRIP_ADJ:
    409         numVerts = 2;
    410         break;
    411     case TOP_TRIANGLE_LIST:
    412     case TOP_TRIANGLE_STRIP:
    413     case TOP_TRIANGLE_FAN:
    414     case TOP_TRI_LIST_ADJ:
    415     case TOP_TRI_STRIP_ADJ:
    416     case TOP_TRI_STRIP_REVERSE:
    417     case TOP_RECT_LIST:
    418         numVerts = 3;
    419         break;
    420     case TOP_QUAD_LIST:
    421     case TOP_QUAD_STRIP:
    422         numVerts = 4;
    423         break;
    424     case TOP_PATCHLIST_1:
    425     case TOP_PATCHLIST_2:
    426     case TOP_PATCHLIST_3:
    427     case TOP_PATCHLIST_4:
    428     case TOP_PATCHLIST_5:
    429     case TOP_PATCHLIST_6:
    430     case TOP_PATCHLIST_7:
    431     case TOP_PATCHLIST_8:
    432     case TOP_PATCHLIST_9:
    433     case TOP_PATCHLIST_10:
    434     case TOP_PATCHLIST_11:
    435     case TOP_PATCHLIST_12:
    436     case TOP_PATCHLIST_13:
    437     case TOP_PATCHLIST_14:
    438     case TOP_PATCHLIST_15:
    439     case TOP_PATCHLIST_16:
    440     case TOP_PATCHLIST_17:
    441     case TOP_PATCHLIST_18:
    442     case TOP_PATCHLIST_19:
    443     case TOP_PATCHLIST_20:
    444     case TOP_PATCHLIST_21:
    445     case TOP_PATCHLIST_22:
    446     case TOP_PATCHLIST_23:
    447     case TOP_PATCHLIST_24:
    448     case TOP_PATCHLIST_25:
    449     case TOP_PATCHLIST_26:
    450     case TOP_PATCHLIST_27:
    451     case TOP_PATCHLIST_28:
    452     case TOP_PATCHLIST_29:
    453     case TOP_PATCHLIST_30:
    454     case TOP_PATCHLIST_31:
    455     case TOP_PATCHLIST_32:
    456         numVerts = topology - TOP_PATCHLIST_BASE;
    457         break;
    458     default:
    459         SWR_INVALID("Unsupported topology: %d", topology);
    460         break;
    461     }
    462 
    463     if (includeAdjVerts)
    464     {
    465         switch (topology)
    466         {
    467         case TOP_LISTSTRIP_ADJ:
    468         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
    469         case TOP_TRI_STRIP_ADJ:
    470         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
    471         default: break;
    472         }
    473     }
    474 
    475     return numVerts;
    476 }
    477 
    478 //////////////////////////////////////////////////////////////////////////
    479 /// @brief Generate mask from remaining work.
    480 /// @param numWorkItems - Number of items being worked on by a SIMD.
    481 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
    482 {
    483     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    484     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    485     return _simd_castps_si(_simd_vmask_ps(mask));
    486 }
    487 
    488 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
    489 {
    490     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    491     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    492     return _simd16_castps_si(_simd16_vmask_ps(mask));
    493 }
    494 
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly state the prims are assembled from.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the assembled per-prim attributes are
///                    written to before the streamout jit consumes them.
/// @param streamIndex - which of the (up to 4) SO streams to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Each set bit in soMask selects one attribute slot to stream out.
        while (_BitScanForward(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is the dword offset of this attribute relative
            // to the start of a vertex within pPrimData. The slot index is
            // used directly (GL slot-0 semantics, per the original note that
            // GL needs slot 0 while other APIs would rebase by one).
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            // Clear the processed slot and continue with the next set bit.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            // Mirror the final offset into per-draw dynamic state and mark
            // it dirty so downstream consumers pick up the new value.
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
    587 
    588 #if USE_SIMD16_FRONTEND
    589 //////////////////////////////////////////////////////////////////////////
    590 /// Is value an even number (a multiple of two)
    591 ///
    592 template <typename T>
    593 INLINE static bool IsEven(T value)
    594 {
    595     return (value & 1) == 0;
    596 }
    597 
    598 //////////////////////////////////////////////////////////////////////////
    599 /// Round up value to an even number (a multiple of two)
    600 ///
    601 template <typename T>
    602 INLINE static T RoundUpEven(T value)
    603 {
    604     return (value + 1) & ~1;
    605 }
    606 
    607 //////////////////////////////////////////////////////////////////////////
    608 /// Round down value to an even number (a multiple of two)
    609 ///
    610 template <typename T>
    611 INLINE static T RoundDownEven(T value)
    612 {
    613     return value & ~1;
    614 }
    615 
//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes; an odd trailing
/// vertex is handled (it fills only the low half of the last output)
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    // Each output simd16vertex interleaves two consecutive input simdvertexes:
    // input i fills the low half, input i + 1 (when present) the high half.
    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                // Low half: insert lane group 0 into a zeroed register.
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                // High half: only valid when i + 1 is still in range
                // (vertexCount may be odd).
                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        // Commit the assembled attributes to the packed destination vertex.
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}
    654 
    655 #endif
    656 //////////////////////////////////////////////////////////////////////////
    657 /// @brief Computes number of invocations. The current index represents
    658 ///        the start of the SIMD. The max index represents how much work
    659 ///        items are remaining. If there is less then a SIMD's xmin of work
    660 ///        then return the remaining amount of work.
    661 /// @param curIndex - The start index for the SIMD.
    662 /// @param maxIndex - The last index for all work items.
    663 static INLINE uint32_t GetNumInvocations(
    664     uint32_t curIndex,
    665     uint32_t maxIndex)
    666 {
    667     uint32_t remainder = (maxIndex - curIndex);
    668 #if USE_SIMD16_FRONTEND
    669     return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
    670 #else
    671     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
    672 #endif
    673 }
    674 
    675 //////////////////////////////////////////////////////////////////////////
    676 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
    677 ///        The geometry shader will loop over each active streamout buffer, assembling
    678 ///        primitives for the downstream stages. When multistream output is enabled,
    679 ///        the generated stream ID buffer from the GS needs to be converted to a cut
    680 ///        buffer for the primitive assembler.
    681 /// @param stream - stream id to generate the cut buffer for
    682 /// @param pStreamIdBase - pointer to the stream ID buffer
    683 /// @param numEmittedVerts - Number of total verts emitted by the GS
    684 /// @param pCutBuffer - output buffer to write cuts to
    685 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
    686 {
    687     SWR_ASSERT(stream < MAX_SO_STREAMS);
    688 
    689     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
    690     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
    691 
    692     for (uint32_t b = 0; b < numOutputBytes; ++b)
    693     {
    694         uint8_t curInputByte = pStreamIdBase[2*b];
    695         uint8_t outByte = 0;
    696         for (uint32_t i = 0; i < 4; ++i)
    697         {
    698             if ((curInputByte & 0x3) != stream)
    699             {
    700                 outByte |= (1 << i);
    701             }
    702             curInputByte >>= 2;
    703         }
    704 
    705         curInputByte = pStreamIdBase[2 * b + 1];
    706         for (uint32_t i = 0; i < 4; ++i)
    707         {
    708             if ((curInputByte & 0x3) != stream)
    709             {
    710                 outByte |= (1 << (i + 4));
    711             }
    712             curInputByte >>= 2;
    713         }
    714 
    715         *pCutBuffer++ = outByte;
    716     }
    717 }
    718 
// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;                    // GS input vertex buffer
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];  // one GS output buffer per SIMD lane
    uint8_t* pGsTransposed;            // GS output transposed SOA->AOS for the primitive assembler (see TransposeSOAtoAOS)
    void* pStreamCutBuffer;            // scratch for stream-id -> cut-buffer conversion (see ProcessStreamIdBuffer)
};
    727 
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices outputted by the GS
/// @param numAttribs - Number of attributes per vertex
template<typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    // Source vertices are scalar 4-component float attributes; destination is
    // one SIMD register per attribute component.
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    // Byte offsets of each of the SimdWidth source vertices, used as gather indices.
    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);

    // Process whole SIMD batches, masking off lanes past numVerts in the last one.
    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            // Gather one component (x/y/z/w) of this attribute from each source vertex...
            auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
            auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);

            // ...and store each component as one contiguous SIMD register in the destination.
            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);

            // Advance to the next attribute within the vertex.
            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(typename SIMD_T::Float) * 4;
        }
        // Unsigned underflow on the final iteration is benign: the loop exits
        // before remainingVerts is read again.
        remainingVerts -= SimdWidth;
    }
}
    780 
    781 
    782 //////////////////////////////////////////////////////////////////////////
    783 /// @brief Implements GS stage.
    784 /// @param pDC - pointer to draw context.
    785 /// @param workerId - thread's worker id. Even thread has a unique id.
    786 /// @param pa - The primitive assembly object.
    787 /// @param pGsOut - output stream for GS
    788 template <
    789     typename HasStreamOutT,
    790     typename HasRastT>
    791 static void GeometryShaderStage(
    792     DRAW_CONTEXT *pDC,
    793     uint32_t workerId,
    794     PA_STATE& pa,
    795     GsBuffers* pGsBuffers,
    796     uint32_t* pSoPrimData,
    797 #if USE_SIMD16_FRONTEND
    798     uint32_t numPrims_simd8,
    799 #endif
    800     simdscalari const &primID)
    801 {
    802     SWR_CONTEXT *pContext = pDC->pContext;
    803 
    804     AR_BEGIN(FEGeometryShader, pDC->drawId);
    805 
    806     const API_STATE& state = GetApiState(pDC);
    807     const SWR_GS_STATE* pState = &state.gsState;
    808     SWR_GS_CONTEXT gsContext;
    809 
    810     static uint8_t sNullBuffer[128] = { 0 };
    811 
    812     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    813     {
    814         gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    815     }
    816     gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    817     gsContext.PrimitiveID = primID;
    818 
    819     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    820     simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
    821 
    822     // assemble all attributes for the input primitive
    823     gsContext.inputVertStride = pState->inputVertStride;
    824     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    825     {
    826         uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
    827         uint32_t attribSlot = pState->vertexAttribOffset + slot;
    828         pa.Assemble(srcAttribSlot, attrib);
    829 
    830         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    831         {
    832             gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
    833         }
    834     }
    835 
    836     // assemble position
    837     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    838     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    839     {
    840         gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    841     }
    842 
    843     // record valid prims from the frontend to avoid over binning the newly generated
    844     // prims from the GS
    845 #if USE_SIMD16_FRONTEND
    846     uint32_t numInputPrims = numPrims_simd8;
    847 #else
    848     uint32_t numInputPrims = pa.NumPrims();
    849 #endif
    850 
    851     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    852     {
    853         gsContext.InstanceID = instance;
    854         gsContext.mask = GenerateMask(numInputPrims);
    855 
    856         // execute the geometry shader
    857         state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
    858 
    859         for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    860         {
    861             gsContext.pStreams[i] += pState->allocationSize;
    862         }
    863     }
    864 
    865     // set up new binner and state for the GS output topology
    866 #if USE_SIMD16_FRONTEND
    867     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    868     if (HasRastT::value)
    869     {
    870         switch (pState->outputTopology)
    871         {
    872         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
    873         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
    874         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
    875         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
    876         }
    877     }
    878 
    879 #else
    880     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    881     if (HasRastT::value)
    882     {
    883         switch (pState->outputTopology)
    884         {
    885         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
    886         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
    887         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
    888         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
    889         }
    890     }
    891 
    892 #endif
    893     // foreach input prim:
    894     // - setup a new PA based on the emitted verts for that prim
    895     // - loop over the new verts, calling PA to assemble each prim
    896     uint32_t* pPrimitiveId = (uint32_t*)&primID;
    897 
    898     uint32_t totalPrimsGenerated = 0;
    899     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    900     {
    901         uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
    902 
    903         // Vertex count is either emitted by shader or static
    904         uint32_t vertexCount = 0;
    905         if (pState->staticVertexCount)
    906         {
    907             vertexCount = pState->staticVertexCount;
    908         }
    909         else
    910         {
    911             // If emitted in shader, it should be the stored in the first dword of the output buffer
    912             vertexCount = *(uint32_t*)pInstanceBase;
    913         }
    914 
    915         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    916         {
    917             uint32_t numEmittedVerts = vertexCount;
    918             if (numEmittedVerts == 0)
    919             {
    920                 continue;
    921             }
    922 
    923             uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
    924             uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
    925             uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
    926 
    927 #if USE_SIMD16_FRONTEND
    928             TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
    929 #else
    930             TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
    931 #endif
    932 
    933             uint32_t numAttribs = state.feNumAttributes;
    934 
    935             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
    936             {
    937                 bool processCutVerts = false;
    938                 uint8_t* pCutBuffer = pCutBase;
    939 
    940                 // assign default stream ID, only relevant when GS is outputting a single stream
    941                 uint32_t streamID = 0;
    942                 if (pState->isSingleStream)
    943                 {
    944                     processCutVerts = true;
    945                     streamID = pState->singleStreamID;
    946                     if (streamID != stream) continue;
    947                 }
    948                 else
    949                 {
    950                     // early exit if this stream is not enabled for streamout
    951                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
    952                     {
    953                         continue;
    954                     }
    955 
    956                     // multi-stream output, need to translate StreamID buffer to a cut buffer
    957                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
    958                     pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
    959                     processCutVerts = false;
    960                 }
    961 
    962 #if USE_SIMD16_FRONTEND
    963                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
    964 
    965 #else
    966                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
    967 
    968 #endif
    969                 while (gsPa.GetNextStreamOutput())
    970                 {
    971                     do
    972                     {
    973 #if USE_SIMD16_FRONTEND
    974                         simd16vector attrib_simd16[3];
    975 
    976                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
    977 
    978 #else
    979                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
    980 
    981 #endif
    982                         if (assemble)
    983                         {
    984                             totalPrimsGenerated += gsPa.NumPrims();
    985 
    986                             if (HasStreamOutT::value)
    987                             {
    988 #if ENABLE_AVX512_SIMD16
    989                                 gsPa.useAlternateOffset = false;
    990 #endif
    991                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
    992                             }
    993 
    994                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
    995                             {
    996 #if USE_SIMD16_FRONTEND
    997                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
    998 
    999                                 // Gather data from the SVG if provided.
   1000                                 simd16scalari vViewportIdx = SIMD16::setzero_si();
   1001                                 simd16scalari vRtIdx = SIMD16::setzero_si();
   1002                                 SIMD16::Vec4 svgAttrib[4];
   1003 
   1004                                 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   1005                                 {
   1006                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   1007                                 }
   1008 
   1009 
   1010                                 if (state.backendState.readViewportArrayIndex)
   1011                                 {
   1012                                     vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   1013                                     gsPa.viewportArrayActive = true;
   1014                                 }
   1015                                 if (state.backendState.readRenderTargetArrayIndex)
   1016                                 {
   1017                                     vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   1018                                     gsPa.rtArrayActive = true;
   1019                                 }
   1020 
   1021                                 {
   1022                                     // OOB VPAI indices => forced to zero.
   1023                                     vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
   1024                                     simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   1025                                     simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
   1026                                     vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
   1027 
   1028                                     gsPa.useAlternateOffset = false;
   1029                                     pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
   1030                                 }
   1031 #else
   1032                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
   1033 
   1034                                 // Gather data from the SVG if provided.
   1035                                 simdscalari vViewportIdx = SIMD16::setzero_si();
   1036                                 simdscalari vRtIdx = SIMD16::setzero_si();
   1037                                 SIMD8::Vec4 svgAttrib[4];
   1038 
   1039                                 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   1040                                 {
   1041                                     tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   1042                                 }
   1043 
   1044 
   1045                                 if (state.backendState.readViewportArrayIndex)
   1046                                 {
   1047                                     vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   1048 
   1049                                     // OOB VPAI indices => forced to zero.
   1050                                     vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
   1051                                     simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   1052                                     simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
   1053                                     vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
   1054                                     tessPa.viewportArrayActive = true;
   1055                                 }
   1056                                 if (state.backendState.readRenderTargetArrayIndex)
   1057                                 {
   1058                                     vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   1059                                     tessPa.rtArrayActive = true;
   1060                                 }
   1061 
   1062                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
   1063 #endif
   1064                             }
   1065                         }
   1066                     } while (gsPa.NextPrim());
   1067                 }
   1068             }
   1069         }
   1070     }
   1071 
   1072     // update GS pipeline stats
   1073     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
   1074     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
   1075     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
   1076     AR_END(FEGeometryShader, 1);
   1077 }
   1078 
   1079 //////////////////////////////////////////////////////////////////////////
   1080 /// @brief Allocate GS buffers
   1081 /// @param pDC - pointer to draw context.
   1082 /// @param state - API state
   1083 /// @param ppGsOut - pointer to GS output buffer allocation
   1084 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
   1085 template<typename SIMD_T, uint32_t SIMD_WIDTH>
   1086 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
   1087 {
   1088     auto pArena = pDC->pArena;
   1089     SWR_ASSERT(pArena != nullptr);
   1090     SWR_ASSERT(state.gsState.gsEnable);
   1091 
   1092     const SWR_GS_STATE& gsState = state.gsState;
   1093 
   1094     // Allocate storage for vertex inputs
   1095     uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
   1096     pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
   1097 
   1098     // Allocate arena space to hold GS output verts
   1099     const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
   1100 
   1101     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
   1102     {
   1103         pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
   1104     }
   1105 
   1106     // Allocate storage for transposed GS output
   1107     uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
   1108     uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
   1109     pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
   1110 
   1111     // Allocate storage to hold temporary stream->cut buffer, if necessary
   1112     if (state.gsState.isSingleStream)
   1113     {
   1114         pGsBuffers->pStreamCutBuffer = nullptr;
   1115     }
   1116     else
   1117     {
   1118         pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
   1119     }
   1120 }
   1121 
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;                // hull shader execution context
    ScalarPatch patchData[KNOB_SIMD_WIDTH];  // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                            // tessellator context memory, lazily allocated
    size_t tsCtxSize;                        // byte size of the pTxCtx allocation

    simdscalar* pDSOutput;                   // domain shader output buffer, grown on demand
    size_t dsOutputAllocSize;                // current byte size of the pDSOutput allocation
};

// Per-thread tessellation scratch data; created on first use by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
   1137 
   1138 //////////////////////////////////////////////////////////////////////////
   1139 /// @brief Allocate tessellation data for this worker thread.
   1140 INLINE
   1141 static void AllocateTessellationData(SWR_CONTEXT* pContext)
   1142 {
   1143     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
   1144     if (gt_pTessellationThreadData == nullptr)
   1145     {
   1146         gt_pTessellationThreadData = (TessellationThreadLocalData*)
   1147             AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
   1148         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
   1149     }
   1150 }
   1151 
   1152 //////////////////////////////////////////////////////////////////////////
   1153 /// @brief Implements Tessellation Stages.
   1154 /// @param pDC - pointer to draw context.
   1155 /// @param workerId - thread's worker id. Even thread has a unique id.
   1156 /// @param pa - The primitive assembly object.
   1157 /// @param pGsOut - output stream for GS
   1158 template <
   1159     typename HasGeometryShaderT,
   1160     typename HasStreamOutT,
   1161     typename HasRastT>
   1162 static void TessellationStages(
   1163     DRAW_CONTEXT *pDC,
   1164     uint32_t workerId,
   1165     PA_STATE& pa,
   1166     GsBuffers* pGsBuffers,
   1167     uint32_t* pSoPrimData,
   1168 #if USE_SIMD16_FRONTEND
   1169     uint32_t numPrims_simd8,
   1170 #endif
   1171     simdscalari const &primID)
   1172 {
   1173     SWR_CONTEXT *pContext = pDC->pContext;
   1174     const API_STATE& state = GetApiState(pDC);
   1175     const SWR_TS_STATE& tsState = state.tsState;
   1176 
   1177     SWR_ASSERT(gt_pTessellationThreadData);
   1178 
   1179     HANDLE tsCtx = TSInitCtx(
   1180         tsState.domain,
   1181         tsState.partitioning,
   1182         tsState.tsOutputTopology,
   1183         gt_pTessellationThreadData->pTxCtx,
   1184         gt_pTessellationThreadData->tsCtxSize);
   1185     if (tsCtx == nullptr)
   1186     {
   1187         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
   1188         tsCtx = TSInitCtx(
   1189             tsState.domain,
   1190             tsState.partitioning,
   1191             tsState.tsOutputTopology,
   1192             gt_pTessellationThreadData->pTxCtx,
   1193             gt_pTessellationThreadData->tsCtxSize);
   1194     }
   1195     SWR_ASSERT(tsCtx);
   1196 
   1197 #if USE_SIMD16_FRONTEND
   1198     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
   1199     if (HasRastT::value)
   1200     {
   1201         switch (tsState.postDSTopology)
   1202         {
   1203         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
   1204         case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
   1205         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
   1206         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
   1207         }
   1208     }
   1209 
   1210 #else
   1211     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
   1212     if (HasRastT::value)
   1213     {
   1214         switch (tsState.postDSTopology)
   1215         {
   1216         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
   1217         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
   1218         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
   1219         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
   1220         }
   1221     }
   1222 
   1223 #endif
   1224     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
   1225     hsContext.pCPout = gt_pTessellationThreadData->patchData;
   1226     hsContext.PrimitiveID = primID;
   1227 
   1228     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
   1229     // Max storage for one attribute for an entire simdprimitive
   1230     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
   1231 
   1232     // assemble all attributes for the input primitives
   1233     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
   1234     {
   1235         uint32_t attribSlot = tsState.vertexAttribOffset + slot;
   1236         pa.Assemble(attribSlot, simdattrib);
   1237 
   1238         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
   1239         {
   1240             hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
   1241         }
   1242     }
   1243 
   1244 #if defined(_DEBUG)
   1245     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
   1246 #endif
   1247 
   1248 #if USE_SIMD16_FRONTEND
   1249     uint32_t numPrims = numPrims_simd8;
   1250 #else
   1251     uint32_t numPrims = pa.NumPrims();
   1252 #endif
   1253     hsContext.mask = GenerateMask(numPrims);
   1254 
   1255     // Run the HS
   1256     AR_BEGIN(FEHullShader, pDC->drawId);
   1257     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
   1258     AR_END(FEHullShader, 0);
   1259 
   1260     UPDATE_STAT_FE(HsInvocations, numPrims);
   1261 
   1262     const uint32_t* pPrimId = (const uint32_t*)&primID;
   1263 
   1264     for (uint32_t p = 0; p < numPrims; ++p)
   1265     {
   1266         // Run Tessellator
   1267         SWR_TS_TESSELLATED_DATA tsData = { 0 };
   1268         AR_BEGIN(FETessellation, pDC->drawId);
   1269         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
   1270         AR_EVENT(TessPrimCount(1));
   1271         AR_END(FETessellation, 0);
   1272 
   1273         if (tsData.NumPrimitives == 0)
   1274         {
   1275             continue;
   1276         }
   1277         SWR_ASSERT(tsData.NumDomainPoints);
   1278 
   1279         // Allocate DS Output memory
   1280         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
   1281 #if USE_SIMD16_FRONTEND
   1282         size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;      // simd8 -> simd16, padding
   1283 #else
   1284         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
   1285         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
   1286 #endif
   1287         if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
   1288         {
   1289             AlignedFree(gt_pTessellationThreadData->pDSOutput);
   1290             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
   1291             gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
   1292         }
   1293         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
   1294         SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
   1295 
   1296 #if defined(_DEBUG)
   1297         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
   1298 #endif
   1299 
   1300         // Run Domain Shader
   1301         SWR_DS_CONTEXT dsContext;
   1302         dsContext.PrimitiveID = pPrimId[p];
   1303         dsContext.pCpIn = &hsContext.pCPout[p];
   1304         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
   1305         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
   1306         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
   1307         dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
   1308 #if USE_SIMD16_FRONTEND
   1309         dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
   1310 #else
   1311         dsContext.vectorStride = requiredDSVectorInvocations;
   1312 #endif
   1313 
   1314         uint32_t dsInvocations = 0;
   1315 
   1316         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
   1317         {
   1318             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
   1319 
   1320             AR_BEGIN(FEDomainShader, pDC->drawId);
   1321             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
   1322             AR_END(FEDomainShader, 0);
   1323 
   1324             dsInvocations += KNOB_SIMD_WIDTH;
   1325         }
   1326         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
   1327 
   1328 #if USE_SIMD16_FRONTEND
   1329         SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
   1330 
   1331 #endif
   1332         PA_TESS tessPa(
   1333             pDC,
   1334 #if USE_SIMD16_FRONTEND
   1335             reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
   1336             dsContext.vectorStride / 2,                                         // simd8 -> simd16
   1337 #else
   1338             dsContext.pOutputData,
   1339             dsContext.vectorStride,
   1340 #endif
   1341             SWR_VTX_NUM_SLOTS,
   1342             tsState.numDsOutputAttribs,
   1343             tsData.ppIndices,
   1344             tsData.NumPrimitives,
   1345             tsState.postDSTopology,
   1346             numVertsPerPrim);
   1347 
   1348         while (tessPa.HasWork())
   1349         {
   1350 #if USE_SIMD16_FRONTEND
   1351             const uint32_t numPrims = tessPa.NumPrims();
   1352             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
   1353             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
   1354 
   1355             const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
   1356             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
   1357             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
   1358 
   1359 #endif
   1360             if (HasGeometryShaderT::value)
   1361             {
   1362 #if USE_SIMD16_FRONTEND
   1363                 tessPa.useAlternateOffset = false;
   1364                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
   1365 
   1366                 if (numPrims_hi)
   1367                 {
   1368                     tessPa.useAlternateOffset = true;
   1369                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
   1370                 }
   1371 #else
   1372                 GeometryShaderStage<HasStreamOutT, HasRastT>(
   1373                     pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
   1374 #endif
   1375             }
   1376             else
   1377             {
   1378                 if (HasStreamOutT::value)
   1379                 {
   1380 #if ENABLE_AVX512_SIMD16
   1381                     tessPa.useAlternateOffset = false;
   1382 #endif
   1383                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
   1384                 }
   1385 
   1386                 if (HasRastT::value)
   1387                 {
   1388 #if USE_SIMD16_FRONTEND
   1389                     simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
   1390 #else
   1391                     simdvector      prim[3];        // Only deal with triangles, lines, or points
   1392 #endif
   1393                     AR_BEGIN(FEPAAssemble, pDC->drawId);
   1394                     bool assemble =
   1395 #if USE_SIMD16_FRONTEND
   1396                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
   1397 #else
   1398                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
   1399 #endif
   1400                     AR_END(FEPAAssemble, 1);
   1401                     SWR_ASSERT(assemble);
   1402 
   1403                     SWR_ASSERT(pfnClipFunc);
   1404 #if USE_SIMD16_FRONTEND
   1405                     // Gather data from the SVG if provided.
   1406                     simd16scalari vViewportIdx = SIMD16::setzero_si();
   1407                     simd16scalari vRtIdx = SIMD16::setzero_si();
   1408                     SIMD16::Vec4 svgAttrib[4];
   1409 
   1410                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   1411                     {
   1412                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   1413                     }
   1414 
   1415 
   1416                     if (state.backendState.readViewportArrayIndex)
   1417                     {
   1418                         vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   1419                         tessPa.viewportArrayActive = true;
   1420                     }
   1421                     if (state.backendState.readRenderTargetArrayIndex)
   1422                     {
   1423                         vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   1424                         tessPa.rtArrayActive = true;
   1425                     }
   1426 
   1427 
   1428                     {
   1429                         // OOB VPAI indices => forced to zero.
   1430                         vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
   1431                         simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   1432                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
   1433                         vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
   1434 
   1435                         tessPa.useAlternateOffset = false;
   1436                         pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
   1437                     }
   1438 #else
   1439                     // Gather data from the SVG if provided.
   1440                     simdscalari vViewportIdx = SIMD16::setzero_si();
   1441                     simdscalari vRtIdx = SIMD16::setzero_si();
   1442                     SIMD8::Vec4 svgAttrib[4];
   1443 
   1444                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   1445                     {
   1446                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   1447                     }
   1448 
   1449                     if (state.backendState.readViewportArrayIndex)
   1450                     {
   1451                         vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   1452 
   1453                         // OOB VPAI indices => forced to zero.
   1454                         vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
   1455                         simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   1456                         simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
   1457                         vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
   1458                         tessPa.viewportArrayActive = true;
   1459                     }
   1460                     if (state.backendState.readRenderTargetArrayIndex)
   1461                     {
   1462                         vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   1463                         tessPa.rtArrayActive = true;
   1464                     }
   1465                     pfnClipFunc(pDC, tessPa, workerId, prim,
   1466                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
   1467 #endif
   1468                 }
   1469             }
   1470 
   1471             tessPa.NextPrim();
   1472 
   1473         } // while (tessPa.HasWork())
   1474     } // for (uint32_t p = 0; p < numPrims; ++p)
   1475 
   1476 #if USE_SIMD16_FRONTEND
   1477     if (gt_pTessellationThreadData->pDSOutput != nullptr)
   1478     {
   1479         AlignedFree(gt_pTessellationThreadData->pDSOutput);
   1480         gt_pTessellationThreadData->pDSOutput = nullptr;
   1481     }
   1482     gt_pTessellationThreadData->dsOutputAllocSize = 0;
   1483 
   1484 #endif
   1485     TSDestroyCtx(tsCtx);
   1486 }
   1487 
// Per-thread (THREAD-qualified; macro defined in threads.h) scratch vertex
// store handed to the primitive assembler by ProcessDraw. It is grown on
// demand: when a draw needs more space than gVertexStoreSize, the old buffer
// is AlignedFree'd and a larger 64-byte-aligned buffer is allocated.
THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
// Current size in bytes of the buffer at gpVertexStore (0 == not allocated).
THREAD uint32_t gVertexStoreSize = 0;
   1490 
   1491 //////////////////////////////////////////////////////////////////////////
   1492 /// @brief FE handler for SwrDraw.
   1493 /// @tparam IsIndexedT - Is indexed drawing enabled
   1494 /// @tparam HasTessellationT - Is tessellation enabled
   1495 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
   1496 /// @tparam HasStreamOutT - Is stream-out enabled
   1497 /// @tparam HasRastT - Is rasterization enabled
   1498 /// @param pContext - pointer to SWR context.
   1499 /// @param pDC - pointer to draw context.
   1500 /// @param workerId - thread's worker id.
   1501 /// @param pUserData - Pointer to DRAW_WORK
   1502 template <
   1503     typename IsIndexedT,
   1504     typename IsCutIndexEnabledT,
   1505     typename HasTessellationT,
   1506     typename HasGeometryShaderT,
   1507     typename HasStreamOutT,
   1508     typename HasRastT>
   1509 void ProcessDraw(
   1510     SWR_CONTEXT *pContext,
   1511     DRAW_CONTEXT *pDC,
   1512     uint32_t workerId,
   1513     void *pUserData)
   1514 {
   1515 
   1516 #if KNOB_ENABLE_TOSS_POINTS
   1517     if (KNOB_TOSS_QUEUE_FE)
   1518     {
   1519         return;
   1520     }
   1521 #endif
   1522 
   1523     AR_BEGIN(FEProcessDraw, pDC->drawId);
   1524 
   1525     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
   1526     const API_STATE&    state = GetApiState(pDC);
   1527 
   1528     uint32_t indexSize = 0;
   1529     uint32_t endVertex = work.numVerts;
   1530 
   1531     const int32_t* pLastRequestedIndex = nullptr;
   1532     if (IsIndexedT::value)
   1533     {
   1534         switch (work.type)
   1535         {
   1536         case R32_UINT:
   1537             indexSize = sizeof(uint32_t);
   1538             pLastRequestedIndex = &(work.pIB[endVertex]);
   1539             break;
   1540         case R16_UINT:
   1541             indexSize = sizeof(uint16_t);
   1542             // nasty address offset to last index
   1543             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
   1544             break;
   1545         case R8_UINT:
   1546             indexSize = sizeof(uint8_t);
   1547             // nasty address offset to last index
   1548             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
   1549             break;
   1550         default:
   1551             SWR_INVALID("Invalid work.type: %d", work.type);
   1552         }
   1553     }
   1554     else
   1555     {
   1556         // No cuts, prune partial primitives.
   1557         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
   1558     }
   1559 
   1560 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
   1561     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
   1562 #endif
   1563 
   1564     GsBuffers gsBuffers;
   1565     if (HasGeometryShaderT::value)
   1566     {
   1567 #if USE_SIMD16_FRONTEND
   1568         AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
   1569 #else
   1570         AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
   1571 #endif
   1572     }
   1573 
   1574     if (HasTessellationT::value)
   1575     {
   1576         SWR_ASSERT(state.tsState.tsEnable == true);
   1577         SWR_ASSERT(state.pfnHsFunc != nullptr);
   1578         SWR_ASSERT(state.pfnDsFunc != nullptr);
   1579 
   1580         AllocateTessellationData(pContext);
   1581     }
   1582     else
   1583     {
   1584         SWR_ASSERT(state.tsState.tsEnable == false);
   1585         SWR_ASSERT(state.pfnHsFunc == nullptr);
   1586         SWR_ASSERT(state.pfnDsFunc == nullptr);
   1587     }
   1588 
   1589     // allocate space for streamout input prim data
   1590     uint32_t* pSoPrimData = nullptr;
   1591     if (HasStreamOutT::value)
   1592     {
   1593         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
   1594     }
   1595 
   1596     const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
   1597 #if USE_SIMD16_FRONTEND
   1598     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
   1599 #else
   1600     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
   1601 #endif
   1602 
   1603     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
   1604 
   1605     // Compute storage requirements for vertex store
   1606     // TODO: allocation needs to be rethought for better cut support
   1607     uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
   1608     uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
   1609 
   1610     // grow the vertex store for the PA as necessary
   1611     if (gVertexStoreSize < vertexStoreSize)
   1612     {
   1613         if (gpVertexStore != nullptr)
   1614         {
   1615             AlignedFree(gpVertexStore);
   1616             gpVertexStore = nullptr;
   1617         }
   1618 
   1619         SWR_ASSERT(gpVertexStore == nullptr);
   1620 
   1621         gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
   1622         gVertexStoreSize = vertexStoreSize;
   1623 
   1624         SWR_ASSERT(gpVertexStore != nullptr);
   1625     }
   1626 
   1627     // choose primitive assembler
   1628 
   1629     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
   1630     PA_STATE& pa = paFactory.GetPA();
   1631 
   1632 #if USE_SIMD16_FRONTEND
   1633 #if USE_SIMD16_SHADERS
   1634     simd16vertex        vin;
   1635 #else
   1636     simdvertex          vin_lo;
   1637     simdvertex          vin_hi;
   1638 #endif
   1639     SWR_VS_CONTEXT      vsContext_lo;
   1640     SWR_VS_CONTEXT      vsContext_hi;
   1641 
   1642 #if USE_SIMD16_SHADERS
   1643     vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
   1644     vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
   1645 #else
   1646     vsContext_lo.pVin = &vin_lo;
   1647     vsContext_hi.pVin = &vin_hi;
   1648 #endif
   1649     vsContext_lo.AlternateOffset = 0;
   1650     vsContext_hi.AlternateOffset = 1;
   1651 
   1652     SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
   1653 
   1654     fetchInfo_lo.pStreams = &state.vertexBuffers[0];
   1655     fetchInfo_lo.StartInstance = work.startInstance;
   1656     fetchInfo_lo.StartVertex = 0;
   1657 
   1658     if (IsIndexedT::value)
   1659     {
   1660         fetchInfo_lo.BaseVertex = work.baseVertex;
   1661 
   1662         // if the entire index buffer isn't being consumed, set the last index
   1663         // so that fetches < a SIMD wide will be masked off
   1664         fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
   1665         if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
   1666         {
   1667             fetchInfo_lo.pLastIndex = pLastRequestedIndex;
   1668         }
   1669     }
   1670     else
   1671     {
   1672         fetchInfo_lo.StartVertex = work.startVertex;
   1673     }
   1674 
   1675     SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
   1676 
   1677     const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   1678 
   1679     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
   1680     {
   1681         uint32_t  i = 0;
   1682 
   1683         simd16scalari vIndex;
   1684 
   1685         if (IsIndexedT::value)
   1686         {
   1687             fetchInfo_lo.pIndices = work.pIB;
   1688             fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
   1689         }
   1690         else
   1691         {
   1692             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
   1693 
   1694             fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
   1695             fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
   1696         }
   1697 
   1698         fetchInfo_lo.CurInstance = instanceNum;
   1699         fetchInfo_hi.CurInstance = instanceNum;
   1700 
   1701         vsContext_lo.InstanceID = instanceNum;
   1702         vsContext_hi.InstanceID = instanceNum;
   1703 
   1704         while (pa.HasWork())
   1705         {
   1706             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
   1707             // So we need to keep this outside of (i < endVertex) check.
   1708 
   1709             simdmask *pvCutIndices_lo = nullptr;
   1710             simdmask *pvCutIndices_hi = nullptr;
   1711 
   1712             if (IsIndexedT::value)
   1713             {
   1714                 // simd16mask <=> simdmask[2]
   1715 
   1716                 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
   1717                 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
   1718             }
   1719 
   1720             simd16vertex &vout = pa.GetNextVsOutput();
   1721 
   1722             vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
   1723             vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
   1724 
   1725             if (i < endVertex)
   1726             {
   1727                 if (!IsIndexedT::value)
   1728                 {
   1729                     fetchInfo_lo.pLastIndex = fetchInfo_lo.pIndices;
   1730                     uint32_t offset;
   1731                     offset = std::min(endVertex-i, (uint32_t) KNOB_SIMD16_WIDTH);
   1732 #if USE_SIMD16_SHADERS
   1733                     fetchInfo_lo.pLastIndex += offset;
   1734 #else
   1735                     fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH);
   1736                     uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
   1737                     assert(offset >= 0);
   1738                     fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices;
   1739                     fetchInfo_hi.pLastIndex += offset2;
   1740 #endif
   1741                 }
   1742                 // 1. Execute FS/VS for a single SIMD.
   1743                 AR_BEGIN(FEFetchShader, pDC->drawId);
   1744 #if USE_SIMD16_SHADERS
   1745                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
   1746 #else
   1747                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);
   1748 
   1749                 if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
   1750                 {
   1751                     state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
   1752                 }
   1753 #endif
   1754                 AR_END(FEFetchShader, 0);
   1755 
   1756                 // forward fetch generated vertex IDs to the vertex shader
   1757 #if USE_SIMD16_SHADERS
   1758 #if USE_SIMD16_VS
   1759                 vsContext_lo.VertexID16 = _simd16_insert_si(
   1760                     vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
   1761                 vsContext_lo.VertexID16 = _simd16_insert_si(
   1762                     vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
   1763 #else
   1764                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
   1765                 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
   1766 #endif
   1767 #else
   1768                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
   1769                 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
   1770 #endif
   1771 
   1772                 // Setup active mask for vertex shader.
   1773 #if USE_SIMD16_VS
   1774                 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
   1775 #else
   1776                 vsContext_lo.mask = GenerateMask(endVertex - i);
   1777                 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
   1778 #endif
   1779 
   1780                 // forward cut mask to the PA
   1781                 if (IsIndexedT::value)
   1782                 {
   1783 #if USE_SIMD16_SHADERS
   1784                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
   1785                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
   1786 #else
   1787                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
   1788                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
   1789 #endif
   1790                 }
   1791 
   1792                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
   1793 
   1794 #if KNOB_ENABLE_TOSS_POINTS
   1795                 if (!KNOB_TOSS_FETCH)
   1796 #endif
   1797                 {
   1798                     AR_BEGIN(FEVertexShader, pDC->drawId);
   1799 #if USE_SIMD16_VS
   1800                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
   1801 #else
   1802                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
   1803 
   1804                     if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
   1805                     {
   1806                         state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
   1807                     }
   1808 #endif
   1809                     AR_END(FEVertexShader, 0);
   1810 
   1811                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
   1812                 }
   1813             }
   1814 
   1815             // 2. Assemble primitives given the last two SIMD.
   1816             do
   1817             {
   1818                 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
   1819 
   1820                 RDTSC_START(FEPAAssemble);
   1821                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
   1822                 RDTSC_STOP(FEPAAssemble, 1, 0);
   1823 
   1824 #if KNOB_ENABLE_TOSS_POINTS
   1825                 if (!KNOB_TOSS_FETCH)
   1826 #endif
   1827                 {
   1828 #if KNOB_ENABLE_TOSS_POINTS
   1829                     if (!KNOB_TOSS_VS)
   1830 #endif
   1831                     {
   1832                         if (assemble)
   1833                         {
   1834                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
   1835 
   1836                             const uint32_t numPrims = pa.NumPrims();
   1837                             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
   1838                             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
   1839 
   1840                             const simd16scalari primID = pa.GetPrimID(work.startPrimID);
   1841                             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
   1842                             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
   1843 
   1844                             if (HasTessellationT::value)
   1845                             {
   1846                                 pa.useAlternateOffset = false;
   1847                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
   1848 
   1849                                 if (numPrims_hi)
   1850                                 {
   1851                                     pa.useAlternateOffset = true;
   1852                                     TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
   1853                                 }
   1854                             }
   1855                             else if (HasGeometryShaderT::value)
   1856                             {
   1857                                 pa.useAlternateOffset = false;
   1858                                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
   1859 
   1860                                 if (numPrims_hi)
   1861                                 {
   1862                                     pa.useAlternateOffset = true;
   1863                                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
   1864                                 }
   1865                             }
   1866                             else
   1867                             {
   1868                                 // If streamout is enabled then stream vertices out to memory.
   1869                                 if (HasStreamOutT::value)
   1870                                 {
   1871                                     pa.useAlternateOffset = false;
   1872                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
   1873                                 }
   1874 
   1875                                 if (HasRastT::value)
   1876                                 {
   1877                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
   1878                                     // Gather data from the SVG if provided.
   1879                                     simd16scalari vpai = SIMD16::setzero_si();
   1880                                     simd16scalari rtai = SIMD16::setzero_si();
   1881                                     SIMD16::Vec4 svgAttrib[4];
   1882 
   1883                                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   1884                                     {
   1885                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   1886                                     }
   1887 
   1888 
   1889                                     if (state.backendState.readViewportArrayIndex)
   1890                                     {
   1891                                         vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   1892                                         pa.viewportArrayActive = true;
   1893                                     }
   1894                                     if (state.backendState.readRenderTargetArrayIndex)
   1895                                     {
   1896                                         rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   1897                                         pa.rtArrayActive = true;
   1898                                     }
   1899 
   1900                                     {
   1901                                         // OOB VPAI indices => forced to zero.
   1902                                         vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
   1903                                         simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   1904                                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
   1905                                         vpai = SIMD16::and_si(vClearMask, vpai);
   1906 
   1907                                         pa.useAlternateOffset = false;
   1908                                         pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
   1909                                     }
   1910                                 }
   1911                             }
   1912                         }
   1913                     }
   1914                 }
   1915             } while (pa.NextPrim());
   1916 
   1917             if (IsIndexedT::value)
   1918             {
   1919                 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
   1920                 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
   1921             }
   1922             else
   1923             {
   1924                 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
   1925             }
   1926 
   1927             i += KNOB_SIMD16_WIDTH;
   1928         }
   1929 
   1930         pa.Reset();
   1931     }
   1932 
   1933 #else
   1934     SWR_VS_CONTEXT      vsContext;
   1935     SWR_FETCH_CONTEXT   fetchInfo = { 0 };
   1936 
   1937     fetchInfo.pStreams = &state.vertexBuffers[0];
   1938     fetchInfo.StartInstance = work.startInstance;
   1939     fetchInfo.StartVertex = 0;
   1940 
   1941     if (IsIndexedT::value)
   1942     {
   1943         fetchInfo.BaseVertex = work.baseVertex;
   1944 
   1945         // if the entire index buffer isn't being consumed, set the last index
   1946         // so that fetches < a SIMD wide will be masked off
   1947         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
   1948         if (pLastRequestedIndex < fetchInfo.pLastIndex)
   1949         {
   1950             fetchInfo.pLastIndex = pLastRequestedIndex;
   1951         }
   1952     }
   1953     else
   1954     {
   1955         fetchInfo.StartVertex = work.startVertex;
   1956     }
   1957 
   1958     const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   1959 
   1960     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
   1961     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
   1962     {
   1963         simdscalari vIndex;
   1964         uint32_t  i = 0;
   1965 
   1966         if (IsIndexedT::value)
   1967         {
   1968             fetchInfo.pIndices = work.pIB;
   1969         }
   1970         else
   1971         {
   1972             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
   1973             fetchInfo.pIndices = (const int32_t*)&vIndex;
   1974         }
   1975 
   1976         fetchInfo.CurInstance = instanceNum;
   1977         vsContext.InstanceID = instanceNum;
   1978 
   1979         while (pa.HasWork())
   1980         {
   1981             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
   1982             // So we need to keep this outside of (i < endVertex) check.
   1983             simdmask* pvCutIndices = nullptr;
   1984             if (IsIndexedT::value)
   1985             {
   1986                 pvCutIndices = &pa.GetNextVsIndices();
   1987             }
   1988 
   1989             simdvertex& vout = pa.GetNextVsOutput();
   1990             vsContext.pVin = &vout;
   1991             vsContext.pVout = &vout;
   1992 
   1993             if (i < endVertex)
   1994             {
   1995 
   1996                 // 1. Execute FS/VS for a single SIMD.
   1997                 AR_BEGIN(FEFetchShader, pDC->drawId);
   1998                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
   1999                 AR_END(FEFetchShader, 0);
   2000 
   2001                 // forward fetch generated vertex IDs to the vertex shader
   2002                 vsContext.VertexID = fetchInfo.VertexID;
   2003 
   2004                 // Setup active mask for vertex shader.
   2005                 vsContext.mask = GenerateMask(endVertex - i);
   2006 
   2007                 // forward cut mask to the PA
   2008                 if (IsIndexedT::value)
   2009                 {
   2010                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
   2011                 }
   2012 
   2013                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
   2014 
   2015 #if KNOB_ENABLE_TOSS_POINTS
   2016                 if (!KNOB_TOSS_FETCH)
   2017 #endif
   2018                 {
   2019                     AR_BEGIN(FEVertexShader, pDC->drawId);
   2020                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
   2021                     AR_END(FEVertexShader, 0);
   2022 
   2023                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
   2024                 }
   2025             }
   2026 
   2027             // 2. Assemble primitives given the last two SIMD.
   2028             do
   2029             {
   2030                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
   2031                 // PaAssemble returns false if there is not enough verts to assemble.
   2032                 AR_BEGIN(FEPAAssemble, pDC->drawId);
   2033                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
   2034                 AR_END(FEPAAssemble, 1);
   2035 
   2036 #if KNOB_ENABLE_TOSS_POINTS
   2037                 if (!KNOB_TOSS_FETCH)
   2038 #endif
   2039                 {
   2040 #if KNOB_ENABLE_TOSS_POINTS
   2041                     if (!KNOB_TOSS_VS)
   2042 #endif
   2043                     {
   2044                         if (assemble)
   2045                         {
   2046                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
   2047 
   2048                             if (HasTessellationT::value)
   2049                             {
   2050                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
   2051                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
   2052                             }
   2053                             else if (HasGeometryShaderT::value)
   2054                             {
   2055                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
   2056                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
   2057                             }
   2058                             else
   2059                             {
   2060                                 // If streamout is enabled then stream vertices out to memory.
   2061                                 if (HasStreamOutT::value)
   2062                                 {
   2063                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
   2064                                 }
   2065 
   2066                                 if (HasRastT::value)
   2067                                 {
   2068                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
   2069 
   2070                                     // Gather data from the SVG if provided.
   2071                                     simdscalari vViewportIdx = SIMD16::setzero_si();
   2072                                     simdscalari vRtIdx = SIMD16::setzero_si();
   2073                                     SIMD8::Vec4 svgAttrib[4];
   2074 
   2075                                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
   2076                                     {
   2077                                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
   2078                                     }
   2079 
   2080                                     if (state.backendState.readViewportArrayIndex)
   2081                                     {
   2082                                         vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
   2083 
   2084                                         // OOB VPAI indices => forced to zero.
   2085                                         vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
   2086                                         simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
   2087                                         simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
   2088                                         vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
   2089                                         tessPa.viewportArrayActive = true;
   2090                                     }
   2091                                     if (state.backendState.readRenderTargetArrayIndex)
   2092                                     {
   2093                                         vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
   2094                                         tessPa.rtArrayActive = true;
   2095                                     }
   2096 
   2097                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
   2098                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
   2099                                 }
   2100                             }
   2101                         }
   2102                     }
   2103                 }
   2104             } while (pa.NextPrim());
   2105 
   2106             if (IsIndexedT::value)
   2107             {
   2108                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
   2109             }
   2110             else
   2111             {
   2112                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
   2113             }
   2114 
   2115             i += KNOB_SIMD_WIDTH;
   2116         }
   2117         pa.Reset();
   2118     }
   2119 
   2120 #endif
   2121 
   2122     AR_END(FEProcessDraw, numPrims * work.numInstances);
   2123 }
   2124 
   2125 struct FEDrawChooser
   2126 {
   2127     typedef PFN_FE_WORK_FUNC FuncType;
   2128 
   2129     template <typename... ArgsB>
   2130     static FuncType GetFunc()
   2131     {
   2132         return ProcessDraw<ArgsB...>;
   2133     }
   2134 };
   2135 
   2136 
   2137 // Selector for correct templated Draw front-end function
   2138 PFN_FE_WORK_FUNC GetProcessDrawFunc(
   2139     bool IsIndexed,
   2140     bool IsCutIndexEnabled,
   2141     bool HasTessellation,
   2142     bool HasGeometryShader,
   2143     bool HasStreamOut,
   2144     bool HasRasterization)
   2145 {
   2146     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
   2147 }
   2148