Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file frontend.cpp
     24 *
     25 * @brief Implementation for Frontend which handles vertex processing,
     26 *        primitive assembly, clipping, binning, etc.
     27 *
     28 ******************************************************************************/
     29 
     30 #include "api.h"
     31 #include "frontend.h"
     32 #include "backend.h"
     33 #include "context.h"
     34 #include "rdtsc_core.h"
     35 #include "utils.h"
     36 #include "threads.h"
     37 #include "pa.h"
     38 #include "clip.h"
     39 #include "tilemgr.h"
     40 #include "tessellator.h"
     41 #include <limits>
     42 
     43 //////////////////////////////////////////////////////////////////////////
     44 /// @brief Helper macro to generate a bitmask
     45 static INLINE uint32_t GenMask(uint32_t numBits)
     46 {
     47     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
     48     return ((1U << numBits) - 1);
     49 }
     50 
     51 //////////////////////////////////////////////////////////////////////////
     52 /// @brief FE handler for SwrSync.
     53 /// @param pContext - pointer to SWR context.
     54 /// @param pDC - pointer to draw context.
     55 /// @param workerId - thread's worker id. Even thread has a unique id.
     56 /// @param pUserData - Pointer to user data passed back to sync callback.
     57 /// @todo This should go away when we switch this to use compute threading.
     58 void ProcessSync(
     59     SWR_CONTEXT *pContext,
     60     DRAW_CONTEXT *pDC,
     61     uint32_t workerId,
     62     void *pUserData)
     63 {
     64     BE_WORK work;
     65     work.type = SYNC;
     66     work.pfnWork = ProcessSyncBE;
     67 
     68     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     69     pTileMgr->enqueue(0, 0, &work);
     70 }
     71 
     72 //////////////////////////////////////////////////////////////////////////
     73 /// @brief FE handler for SwrDestroyContext.
     74 /// @param pContext - pointer to SWR context.
     75 /// @param pDC - pointer to draw context.
     76 /// @param workerId - thread's worker id. Even thread has a unique id.
     77 /// @param pUserData - Pointer to user data passed back to sync callback.
     78 void ProcessShutdown(
     79     SWR_CONTEXT *pContext,
     80     DRAW_CONTEXT *pDC,
     81     uint32_t workerId,
     82     void *pUserData)
     83 {
     84     BE_WORK work;
     85     work.type = SHUTDOWN;
     86     work.pfnWork = ProcessShutdownBE;
     87 
     88     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     89     // Enqueue at least 1 work item for each worker thread
     90     // account for number of numa nodes
     91     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
     92 
     93     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
     94     {
     95         for (uint32_t n = 0; n < numNumaNodes; ++n)
     96         {
     97             pTileMgr->enqueue(i, n, &work);
     98         }
     99     }
    100 }
    101 
    102 //////////////////////////////////////////////////////////////////////////
    103 /// @brief FE handler for SwrClearRenderTarget.
    104 /// @param pContext - pointer to SWR context.
    105 /// @param pDC - pointer to draw context.
    106 /// @param workerId - thread's worker id. Even thread has a unique id.
    107 /// @param pUserData - Pointer to user data passed back to clear callback.
    108 /// @todo This should go away when we switch this to use compute threading.
    109 void ProcessClear(
    110     SWR_CONTEXT *pContext,
    111     DRAW_CONTEXT *pDC,
    112     uint32_t workerId,
    113     void *pUserData)
    114 {
    115     CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
    116     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    117 
    118     // queue a clear to each macro tile
    119     // compute macro tile bounds for the specified rect
    120     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    121     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    122     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    123     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    124 
    125     BE_WORK work;
    126     work.type = CLEAR;
    127     work.pfnWork = ProcessClearBE;
    128     work.desc.clear = *pDesc;
    129 
    130     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    131     {
    132         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    133         {
    134             pTileMgr->enqueue(x, y, &work);
    135         }
    136     }
    137 }
    138 
    139 //////////////////////////////////////////////////////////////////////////
    140 /// @brief FE handler for SwrStoreTiles.
    141 /// @param pContext - pointer to SWR context.
    142 /// @param pDC - pointer to draw context.
    143 /// @param workerId - thread's worker id. Even thread has a unique id.
    144 /// @param pUserData - Pointer to user data passed back to callback.
    145 /// @todo This should go away when we switch this to use compute threading.
    146 void ProcessStoreTiles(
    147     SWR_CONTEXT *pContext,
    148     DRAW_CONTEXT *pDC,
    149     uint32_t workerId,
    150     void *pUserData)
    151 {
    152     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
    153     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    154     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
    155 
    156     // queue a store to each macro tile
    157     // compute macro tile bounds for the specified rect
    158     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    159     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    160     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    161     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    162 
    163     // store tiles
    164     BE_WORK work;
    165     work.type = STORETILES;
    166     work.pfnWork = ProcessStoreTilesBE;
    167     work.desc.storeTiles = *pDesc;
    168 
    169     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    170     {
    171         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    172         {
    173             pTileMgr->enqueue(x, y, &work);
    174         }
    175     }
    176 
    177     AR_END(FEProcessStoreTiles, 0);
    178 }
    179 
    180 //////////////////////////////////////////////////////////////////////////
    181 /// @brief FE handler for SwrInvalidateTiles.
    182 /// @param pContext - pointer to SWR context.
    183 /// @param pDC - pointer to draw context.
    184 /// @param workerId - thread's worker id. Even thread has a unique id.
    185 /// @param pUserData - Pointer to user data passed back to callback.
    186 /// @todo This should go away when we switch this to use compute threading.
    187 void ProcessDiscardInvalidateTiles(
    188     SWR_CONTEXT *pContext,
    189     DRAW_CONTEXT *pDC,
    190     uint32_t workerId,
    191     void *pUserData)
    192 {
    193     AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    194     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    195     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    196 
    197     // compute macro tile bounds for the specified rect
    198     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    199     uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    200     uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    201     uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
    202 
    203     if (pDesc->fullTilesOnly == false)
    204     {
    205         // include partial tiles
    206         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    207         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    208         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    209         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    210     }
    211 
    212     SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    213     SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
    214 
    215     macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    216     macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
    217 
    218     // load tiles
    219     BE_WORK work;
    220     work.type = DISCARDINVALIDATETILES;
    221     work.pfnWork = ProcessDiscardInvalidateTilesBE;
    222     work.desc.discardInvalidateTiles = *pDesc;
    223 
    224     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    225     {
    226         for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    227         {
    228             pTileMgr->enqueue(x, y, &work);
    229         }
    230     }
    231 
    232     AR_END(FEProcessInvalidateTiles, 0);
    233 }
    234 
    235 //////////////////////////////////////////////////////////////////////////
    236 /// @brief Computes the number of primitives given the number of verts.
    237 /// @param mode - primitive topology for draw operation.
    238 /// @param numPrims - number of vertices or indices for draw.
    239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
    240 uint32_t GetNumPrims(
    241     PRIMITIVE_TOPOLOGY mode,
    242     uint32_t numPrims)
    243 {
    244     switch (mode)
    245     {
    246     case TOP_POINT_LIST: return numPrims;
    247     case TOP_TRIANGLE_LIST: return numPrims / 3;
    248     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    249     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    250     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    251     case TOP_QUAD_LIST: return numPrims / 4;
    252     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    253     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    254     case TOP_LINE_LIST: return numPrims / 2;
    255     case TOP_LINE_LOOP: return numPrims;
    256     case TOP_RECT_LIST: return numPrims / 3;
    257     case TOP_LINE_LIST_ADJ: return numPrims / 4;
    258     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    259     case TOP_TRI_LIST_ADJ: return numPrims / 6;
    260     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
    261 
    262     case TOP_PATCHLIST_1:
    263     case TOP_PATCHLIST_2:
    264     case TOP_PATCHLIST_3:
    265     case TOP_PATCHLIST_4:
    266     case TOP_PATCHLIST_5:
    267     case TOP_PATCHLIST_6:
    268     case TOP_PATCHLIST_7:
    269     case TOP_PATCHLIST_8:
    270     case TOP_PATCHLIST_9:
    271     case TOP_PATCHLIST_10:
    272     case TOP_PATCHLIST_11:
    273     case TOP_PATCHLIST_12:
    274     case TOP_PATCHLIST_13:
    275     case TOP_PATCHLIST_14:
    276     case TOP_PATCHLIST_15:
    277     case TOP_PATCHLIST_16:
    278     case TOP_PATCHLIST_17:
    279     case TOP_PATCHLIST_18:
    280     case TOP_PATCHLIST_19:
    281     case TOP_PATCHLIST_20:
    282     case TOP_PATCHLIST_21:
    283     case TOP_PATCHLIST_22:
    284     case TOP_PATCHLIST_23:
    285     case TOP_PATCHLIST_24:
    286     case TOP_PATCHLIST_25:
    287     case TOP_PATCHLIST_26:
    288     case TOP_PATCHLIST_27:
    289     case TOP_PATCHLIST_28:
    290     case TOP_PATCHLIST_29:
    291     case TOP_PATCHLIST_30:
    292     case TOP_PATCHLIST_31:
    293     case TOP_PATCHLIST_32:
    294         return numPrims / (mode - TOP_PATCHLIST_BASE);
    295 
    296     case TOP_POLYGON:
    297     case TOP_POINT_LIST_BF:
    298     case TOP_LINE_STRIP_CONT:
    299     case TOP_LINE_STRIP_BF:
    300     case TOP_LINE_STRIP_CONT_BF:
    301     case TOP_TRIANGLE_FAN_NOSTIPPLE:
    302     case TOP_TRI_STRIP_REVERSE:
    303     case TOP_PATCHLIST_BASE:
    304     case TOP_UNKNOWN:
    305         SWR_ASSERT(false, "Unsupported topology: %d", mode);
    306         return 0;
    307     }
    308 
    309     return 0;
    310 }
    311 
    312 //////////////////////////////////////////////////////////////////////////
    313 /// @brief Computes the number of verts given the number of primitives.
    314 /// @param mode - primitive topology for draw operation.
    315 /// @param numPrims - number of primitives for draw.
    316 uint32_t GetNumVerts(
    317     PRIMITIVE_TOPOLOGY mode,
    318     uint32_t numPrims)
    319 {
    320     switch (mode)
    321     {
    322     case TOP_POINT_LIST: return numPrims;
    323     case TOP_TRIANGLE_LIST: return numPrims * 3;
    324     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    325     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    326     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    327     case TOP_QUAD_LIST: return numPrims * 4;
    328     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    329     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    330     case TOP_LINE_LIST: return numPrims * 2;
    331     case TOP_LINE_LOOP: return numPrims;
    332     case TOP_RECT_LIST: return numPrims * 3;
    333     case TOP_LINE_LIST_ADJ: return numPrims * 4;
    334     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    335     case TOP_TRI_LIST_ADJ: return numPrims * 6;
    336     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
    337 
    338     case TOP_PATCHLIST_1:
    339     case TOP_PATCHLIST_2:
    340     case TOP_PATCHLIST_3:
    341     case TOP_PATCHLIST_4:
    342     case TOP_PATCHLIST_5:
    343     case TOP_PATCHLIST_6:
    344     case TOP_PATCHLIST_7:
    345     case TOP_PATCHLIST_8:
    346     case TOP_PATCHLIST_9:
    347     case TOP_PATCHLIST_10:
    348     case TOP_PATCHLIST_11:
    349     case TOP_PATCHLIST_12:
    350     case TOP_PATCHLIST_13:
    351     case TOP_PATCHLIST_14:
    352     case TOP_PATCHLIST_15:
    353     case TOP_PATCHLIST_16:
    354     case TOP_PATCHLIST_17:
    355     case TOP_PATCHLIST_18:
    356     case TOP_PATCHLIST_19:
    357     case TOP_PATCHLIST_20:
    358     case TOP_PATCHLIST_21:
    359     case TOP_PATCHLIST_22:
    360     case TOP_PATCHLIST_23:
    361     case TOP_PATCHLIST_24:
    362     case TOP_PATCHLIST_25:
    363     case TOP_PATCHLIST_26:
    364     case TOP_PATCHLIST_27:
    365     case TOP_PATCHLIST_28:
    366     case TOP_PATCHLIST_29:
    367     case TOP_PATCHLIST_30:
    368     case TOP_PATCHLIST_31:
    369     case TOP_PATCHLIST_32:
    370         return numPrims * (mode - TOP_PATCHLIST_BASE);
    371 
    372     case TOP_POLYGON:
    373     case TOP_POINT_LIST_BF:
    374     case TOP_LINE_STRIP_CONT:
    375     case TOP_LINE_STRIP_BF:
    376     case TOP_LINE_STRIP_CONT_BF:
    377     case TOP_TRIANGLE_FAN_NOSTIPPLE:
    378     case TOP_TRI_STRIP_REVERSE:
    379     case TOP_PATCHLIST_BASE:
    380     case TOP_UNKNOWN:
    381         SWR_ASSERT(false, "Unsupported topology: %d", mode);
    382         return 0;
    383     }
    384 
    385     return 0;
    386 }
    387 
    388 //////////////////////////////////////////////////////////////////////////
    389 /// @brief Return number of verts per primitive.
    390 /// @param topology - topology
    391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
    392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
    393 {
    394     uint32_t numVerts = 0;
    395     switch (topology)
    396     {
    397     case TOP_POINT_LIST:
    398     case TOP_POINT_LIST_BF:
    399         numVerts = 1;
    400         break;
    401     case TOP_LINE_LIST:
    402     case TOP_LINE_STRIP:
    403     case TOP_LINE_LIST_ADJ:
    404     case TOP_LINE_LOOP:
    405     case TOP_LINE_STRIP_CONT:
    406     case TOP_LINE_STRIP_BF:
    407     case TOP_LISTSTRIP_ADJ:
    408         numVerts = 2;
    409         break;
    410     case TOP_TRIANGLE_LIST:
    411     case TOP_TRIANGLE_STRIP:
    412     case TOP_TRIANGLE_FAN:
    413     case TOP_TRI_LIST_ADJ:
    414     case TOP_TRI_STRIP_ADJ:
    415     case TOP_TRI_STRIP_REVERSE:
    416     case TOP_RECT_LIST:
    417         numVerts = 3;
    418         break;
    419     case TOP_QUAD_LIST:
    420     case TOP_QUAD_STRIP:
    421         numVerts = 4;
    422         break;
    423     case TOP_PATCHLIST_1:
    424     case TOP_PATCHLIST_2:
    425     case TOP_PATCHLIST_3:
    426     case TOP_PATCHLIST_4:
    427     case TOP_PATCHLIST_5:
    428     case TOP_PATCHLIST_6:
    429     case TOP_PATCHLIST_7:
    430     case TOP_PATCHLIST_8:
    431     case TOP_PATCHLIST_9:
    432     case TOP_PATCHLIST_10:
    433     case TOP_PATCHLIST_11:
    434     case TOP_PATCHLIST_12:
    435     case TOP_PATCHLIST_13:
    436     case TOP_PATCHLIST_14:
    437     case TOP_PATCHLIST_15:
    438     case TOP_PATCHLIST_16:
    439     case TOP_PATCHLIST_17:
    440     case TOP_PATCHLIST_18:
    441     case TOP_PATCHLIST_19:
    442     case TOP_PATCHLIST_20:
    443     case TOP_PATCHLIST_21:
    444     case TOP_PATCHLIST_22:
    445     case TOP_PATCHLIST_23:
    446     case TOP_PATCHLIST_24:
    447     case TOP_PATCHLIST_25:
    448     case TOP_PATCHLIST_26:
    449     case TOP_PATCHLIST_27:
    450     case TOP_PATCHLIST_28:
    451     case TOP_PATCHLIST_29:
    452     case TOP_PATCHLIST_30:
    453     case TOP_PATCHLIST_31:
    454     case TOP_PATCHLIST_32:
    455         numVerts = topology - TOP_PATCHLIST_BASE;
    456         break;
    457     default:
    458         SWR_ASSERT(false, "Unsupported topology: %d", topology);
    459         break;
    460     }
    461 
    462     if (includeAdjVerts)
    463     {
    464         switch (topology)
    465         {
    466         case TOP_LISTSTRIP_ADJ:
    467         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
    468         case TOP_TRI_STRIP_ADJ:
    469         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
    470         default: break;
    471         }
    472     }
    473 
    474     return numVerts;
    475 }
    476 
    477 //////////////////////////////////////////////////////////////////////////
    478 /// @brief Generate mask from remaining work.
    479 /// @param numWorkItems - Number of items being worked on by a SIMD.
    480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
    481 {
    482     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    483     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    484     return _simd_castps_si(vMask(mask));
    485 }
    486 
    487 //////////////////////////////////////////////////////////////////////////
    488 /// @brief StreamOut - Streams vertex data out to SO buffers.
    489 ///        Generally, we are only streaming out a SIMDs worth of triangles.
    490 /// @param pDC - pointer to draw context.
    491 /// @param workerId - thread's worker id. Even thread has a unique id.
    492 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
    493 static void StreamOut(
    494     DRAW_CONTEXT* pDC,
    495     PA_STATE& pa,
    496     uint32_t workerId,
    497     uint32_t* pPrimData,
    498     uint32_t streamIndex)
    499 {
    500     SWR_CONTEXT *pContext = pDC->pContext;
    501 
    502     AR_BEGIN(FEStreamout, pDC->drawId);
    503 
    504     const API_STATE& state = GetApiState(pDC);
    505     const SWR_STREAMOUT_STATE &soState = state.soState;
    506 
    507     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    508 
    509     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    510     uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
    511 
    512     SWR_STREAMOUT_CONTEXT soContext = { 0 };
    513 
    514     // Setup buffer state pointers.
    515     for (uint32_t i = 0; i < 4; ++i)
    516     {
    517         soContext.pBuffer[i] = &state.soBuffer[i];
    518     }
    519 
    520     uint32_t numPrims = pa.NumPrims();
    521     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    522     {
    523         DWORD slot = 0;
    524         uint32_t soMask = soState.streamMasks[streamIndex];
    525 
    526         // Write all entries into primitive data buffer for SOS.
    527         while (_BitScanForward(&slot, soMask))
    528         {
    529             __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
    530             uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
    531             pa.AssembleSingle(paSlot, primIndex, attrib);
    532 
    533             // Attribute offset is relative offset from start of vertex.
    534             // Note that attributes start at slot 1 in the PA buffer. We need to write this
    535             // to prim data starting at slot 0. Which is why we do (slot - 1).
    536             // Also note: GL works slightly differently, and needs slot 0
    537             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
    538 
    539             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
    540             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
    541             {
    542                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
    543 
    544                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
    545             }
    546             soMask &= ~(1 << slot);
    547         }
    548 
    549         // Update pPrimData pointer
    550         soContext.pPrimData = pPrimData;
    551 
    552         // Call SOS
    553         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
    554         state.pfnSoFunc[streamIndex](soContext);
    555     }
    556 
    557     // Update SO write offset. The driver provides memory for the update.
    558     for (uint32_t i = 0; i < 4; ++i)
    559     {
    560         if (state.soBuffer[i].pWriteOffset)
    561         {
    562             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
    563         }
    564 
    565         if (state.soBuffer[i].soWriteEnable)
    566         {
    567             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
    568             pDC->dynState.SoWriteOffsetDirty[i] = true;
    569         }
    570     }
    571 
    572     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    573     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
    574 
    575     AR_END(FEStreamout, 1);
    576 }
    577 
    578 //////////////////////////////////////////////////////////////////////////
    579 /// @brief Computes number of invocations. The current index represents
    580 ///        the start of the SIMD. The max index represents how much work
    581 ///        items are remaining. If there is less then a SIMD's xmin of work
    582 ///        then return the remaining amount of work.
    583 /// @param curIndex - The start index for the SIMD.
    584 /// @param maxIndex - The last index for all work items.
    585 static INLINE uint32_t GetNumInvocations(
    586     uint32_t curIndex,
    587     uint32_t maxIndex)
    588 {
    589     uint32_t remainder = (maxIndex - curIndex);
    590     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
    591 }
    592 
    593 //////////////////////////////////////////////////////////////////////////
    594 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
    595 ///        The geometry shader will loop over each active streamout buffer, assembling
    596 ///        primitives for the downstream stages. When multistream output is enabled,
    597 ///        the generated stream ID buffer from the GS needs to be converted to a cut
    598 ///        buffer for the primitive assembler.
    599 /// @param stream - stream id to generate the cut buffer for
    600 /// @param pStreamIdBase - pointer to the stream ID buffer
    601 /// @param numEmittedVerts - Number of total verts emitted by the GS
    602 /// @param pCutBuffer - output buffer to write cuts to
    603 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
    604 {
    605     SWR_ASSERT(stream < MAX_SO_STREAMS);
    606 
    607     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
    608     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
    609 
    610     for (uint32_t b = 0; b < numOutputBytes; ++b)
    611     {
    612         uint8_t curInputByte = pStreamIdBase[2*b];
    613         uint8_t outByte = 0;
    614         for (uint32_t i = 0; i < 4; ++i)
    615         {
    616             if ((curInputByte & 0x3) != stream)
    617             {
    618                 outByte |= (1 << i);
    619             }
    620             curInputByte >>= 2;
    621         }
    622 
    623         curInputByte = pStreamIdBase[2 * b + 1];
    624         for (uint32_t i = 0; i < 4; ++i)
    625         {
    626             if ((curInputByte & 0x3) != stream)
    627             {
    628                 outByte |= (1 << (i + 4));
    629             }
    630             curInputByte >>= 2;
    631         }
    632 
    633         *pCutBuffer++ = outByte;
    634     }
    635 }
    636 
    // GS context handed to the geometry shader function by GeometryShaderStage,
    // which fills in the output stream / cut buffer pointers, primitive ID,
    // input vertex attributes, instance ID and lane mask before each call.
    // NOTE(review): THREAD is presumably a thread-local storage qualifier,
    // giving each worker thread its own instance - confirm against the macro.
    637 THREAD SWR_GS_CONTEXT tlsGsContext;
    638 
    639 //////////////////////////////////////////////////////////////////////////
    640 /// @brief Implements GS stage.
    641 /// @param pDC - pointer to draw context.
    642 /// @param workerId - thread's worker id. Even thread has a unique id.
    643 /// @param pa - The primitive assembly object.
    644 /// @param pGsOut - output stream for GS
    645 template <
    646     typename HasStreamOutT,
    647     typename HasRastT>
    648 static void GeometryShaderStage(
    649     DRAW_CONTEXT *pDC,
    650     uint32_t workerId,
    651     PA_STATE& pa,
    652     void* pGsOut,
    653     void* pCutBuffer,
    654     void* pStreamCutBuffer,
    655     uint32_t* pSoPrimData,
    656     simdscalari primID)
    657 {
    658     SWR_CONTEXT *pContext = pDC->pContext;
    659 
    660     AR_BEGIN(FEGeometryShader, pDC->drawId);
    661 
    662     const API_STATE& state = GetApiState(pDC);
    663     const SWR_GS_STATE* pState = &state.gsState;
    664 
    665     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    666     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
    667 
    668     tlsGsContext.pStream = (uint8_t*)pGsOut;
    669     tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    670     tlsGsContext.PrimitiveID = primID;
    671 
    672     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    673     simdvector attrib[MAX_ATTRIBUTES];
    674 
    675     // assemble all attributes for the input primitive
    676     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    677     {
    678         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
    679         pa.Assemble(attribSlot, attrib);
    680 
    681         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    682         {
    683             tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
    684         }
    685     }
    686 
    687     // assemble position
    688     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    689     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    690     {
    691         tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    692     }
    693 
    694     const uint32_t vertexStride = sizeof(simdvertex);
    695     const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    696     const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    697     const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    698     uint32_t cutPrimStride;
    699     uint32_t cutInstanceStride;
    700 
    701     if (pState->isSingleStream)
    702     {
    703         cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    704         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    705     }
    706     else
    707     {
    708         cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    709         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    710     }
    711 
    712     // record valid prims from the frontend to avoid over binning the newly generated
    713     // prims from the GS
    714     uint32_t numInputPrims = pa.NumPrims();
    715 
    716     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    717     {
    718         tlsGsContext.InstanceID = instance;
    719         tlsGsContext.mask = GenerateMask(numInputPrims);
    720 
    721         // execute the geometry shader
    722         state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
    723 
    724         tlsGsContext.pStream += instanceStride;
    725         tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    726     }
    727 
    728     // set up new binner and state for the GS output topology
    729     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    730     if (HasRastT::value)
    731     {
    732         switch (pState->outputTopology)
    733         {
    734         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
    735         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
    736         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
    737         default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
    738         }
    739     }
    740 
    741     // foreach input prim:
    742     // - setup a new PA based on the emitted verts for that prim
    743     // - loop over the new verts, calling PA to assemble each prim
    744     uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    745     uint32_t* pPrimitiveId = (uint32_t*)&primID;
    746 
    747     uint32_t totalPrimsGenerated = 0;
    748     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    749     {
    750         uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
    751         uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
    752         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    753         {
    754             uint32_t numEmittedVerts = pVertexCount[inputPrim];
    755             if (numEmittedVerts == 0)
    756             {
    757                 continue;
    758             }
    759 
    760             uint8_t* pBase = pInstanceBase + instance * instanceStride;
    761             uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
    762 
    763             uint32_t numAttribs = state.feNumAttributes;
    764 
    765             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
    766             {
    767                 bool processCutVerts = false;
    768 
    769                 uint8_t* pCutBuffer = pCutBase;
    770 
    771                 // assign default stream ID, only relevant when GS is outputting a single stream
    772                 uint32_t streamID = 0;
    773                 if (pState->isSingleStream)
    774                 {
    775                     processCutVerts = true;
    776                     streamID = pState->singleStreamID;
    777                     if (streamID != stream) continue;
    778                 }
    779                 else
    780                 {
    781                     // early exit if this stream is not enabled for streamout
    782                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
    783                     {
    784                         continue;
    785                     }
    786 
    787                     // multi-stream output, need to translate StreamID buffer to a cut buffer
    788                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
    789                     pCutBuffer = (uint8_t*)pStreamCutBuffer;
    790                     processCutVerts = false;
    791                 }
    792 
    793                 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
    794 
    795                 while (gsPa.GetNextStreamOutput())
    796                 {
    797                     do
    798                     {
    799                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
    800 
    801                         if (assemble)
    802                         {
    803                             totalPrimsGenerated += gsPa.NumPrims();
    804 
    805                             if (HasStreamOutT::value)
    806                             {
    807                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
    808                             }
    809 
    810                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
    811                             {
    812                                 simdscalari vPrimId;
    813                                 // pull primitiveID from the GS output if available
    814                                 if (state.gsState.emitsPrimitiveID)
    815                                 {
    816                                     simdvector primIdAttrib[3];
    817                                     gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
    818                                     vPrimId = _simd_castps_si(primIdAttrib[0].x);
    819                                 }
    820                                 else
    821                                 {
    822                                     vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
    823                                 }
    824 
    825                                 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
    826                                 simdscalari vViewPortIdx;
    827                                 if (state.gsState.emitsViewportArrayIndex)
    828                                 {
    829                                     simdvector vpiAttrib[3];
    830                                     gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
    831 
    832                                     // OOB indices => forced to zero.
    833                                     simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
    834                                     simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
    835                                     vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
    836 
    837                                     vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
    838                                 }
    839                                 else
    840                                 {
    841                                     vViewPortIdx = _simd_set1_epi32(0);
    842                                 }
    843 
    844                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
    845                             }
    846                         }
    847                     } while (gsPa.NextPrim());
    848                 }
    849             }
    850         }
    851     }
    852 
    853     // update GS pipeline stats
    854     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    855     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    856 	AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
    857     AR_END(FEGeometryShader, 1);
    858 }
    859 
    860 //////////////////////////////////////////////////////////////////////////
    861 /// @brief Allocate GS buffers
    862 /// @param pDC - pointer to draw context.
    863 /// @param state - API state
    864 /// @param ppGsOut - pointer to GS output buffer allocation
    865 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
    866 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    867     void **ppStreamCutBuffer)
    868 {
    869     auto pArena = pDC->pArena;
    870     SWR_ASSERT(pArena != nullptr);
    871     SWR_ASSERT(state.gsState.gsEnable);
    872     // allocate arena space to hold GS output verts
    873     // @todo pack attribs
    874     // @todo support multiple streams
    875     const uint32_t vertexStride = sizeof(simdvertex);
    876     const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    877     uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    878     *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
    879 
    880     const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    881     const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    882     const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    883     const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    884 
    885     // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    886     // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
    887 
    888     // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    889     if (state.gsState.isSingleStream)
    890     {
    891         *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    892         *ppStreamCutBuffer = nullptr;
    893     }
    894     else
    895     {
    896         *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
    897         *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    898     }
    899 
    900 }
    901 
    902 //////////////////////////////////////////////////////////////////////////
    903 /// @brief Contains all data generated by the HS and passed to the
    904 /// tessellator and DS.
        /// It also keeps the tessellator context memory and the DS output buffer
        /// between calls so TessellationStages can reuse them.
    905 struct TessellationThreadLocalData
    906 {
            // Context handed to the hull shader (pfnHsFunc) by TessellationStages.
    907     SWR_HS_CONTEXT hsContext;
            // HS patch output, one ScalarPatch per SIMD lane; hsContext.pCPout is
            // pointed at this array.
    908     ScalarPatch patchData[KNOB_SIMD_WIDTH];
            // Memory block passed to TSInitCtx; allocated on demand by
            // TessellationStages when TSInitCtx returns nullptr.
    909     void* pTxCtx;
            // Size passed to TSInitCtx and used to size the pTxCtx allocation.
    910     size_t tsCtxSize;
    911 
            // Domain shader output buffer; replaced with a larger one by
            // TessellationStages whenever a patch needs more vectors.
    912     simdscalar* pDSOutput;
            // Current capacity of pDSOutput, counted in DS output vectors
            // (DS vector invocations * numDsOutputAttribs).
    913     size_t numDSOutputVectors;
    914 };
    915 
        // One instance per thread, created lazily by AllocateTessellationData.
        // NOTE(review): THREAD is presumably the thread-local storage specifier - confirm.
    916 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
    917 
    918 //////////////////////////////////////////////////////////////////////////
    919 /// @brief Allocate tessellation data for this worker thread.
    920 INLINE
    921 static void AllocateTessellationData(SWR_CONTEXT* pContext)
    922 {
    923     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
    924     if (gt_pTessellationThreadData == nullptr)
    925     {
    926         gt_pTessellationThreadData = (TessellationThreadLocalData*)
    927             AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
    928         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    929     }
    930 }
    931 
    932 //////////////////////////////////////////////////////////////////////////
    933 /// @brief Implements Tessellation Stages.
        ///        Runs the hull shader once for the SIMD of input patches, then for
        ///        each patch runs the tessellator and the domain shader and hands
        ///        the resulting primitives to the GS stage, or to stream-out and
        ///        the clipper when no GS is bound.
        /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
        /// @tparam HasStreamOutT - Is stream-out enabled
        /// @tparam HasRastT - Is rasterization enabled
    934 /// @param pDC - pointer to draw context.
    935 /// @param workerId - thread's worker id. Every thread has a unique id.
    936 /// @param pa - The primitive assembly object.
    937 /// @param pGsOut - output stream for GS
        /// @param pCutBuffer - GS cut / stream ID buffer; only forwarded to the GS stage.
        /// @param pCutStreamBuffer - GS per-stream cut buffer; only forwarded to the GS stage.
        /// @param pSoPrimData - stream-out primitive data; forwarded to StreamOut / the GS stage.
        /// @param primID - primitive IDs of the input patches, one per SIMD lane.
    938 template <
    939     typename HasGeometryShaderT,
    940     typename HasStreamOutT,
    941     typename HasRastT>
    942 static void TessellationStages(
    943     DRAW_CONTEXT *pDC,
    944     uint32_t workerId,
    945     PA_STATE& pa,
    946     void* pGsOut,
    947     void* pCutBuffer,
    948     void* pCutStreamBuffer,
    949     uint32_t* pSoPrimData,
    950     simdscalari primID)
    951 {
            // NOTE(review): pContext is not referenced directly below; presumably it
            // is consumed by the AR_* / UPDATE_STAT_FE macros - confirm.
    952     SWR_CONTEXT *pContext = pDC->pContext;
    953     const API_STATE& state = GetApiState(pDC);
    954     const SWR_TS_STATE& tsState = state.tsState;
    955 
            // The per-thread scratch data must already exist; ProcessDraw calls
            // AllocateTessellationData before any primitive reaches this function.
    956     SWR_ASSERT(gt_pTessellationThreadData);
    957 
            // First try to set up the tessellator context in the cached memory block.
            // NOTE(review): presumably TSInitCtx returns nullptr and reports the
            // required size through tsCtxSize when the block is missing or too
            // small - confirm against the tessellator wrapper.
    958     HANDLE tsCtx = TSInitCtx(
    959         tsState.domain,
    960         tsState.partitioning,
    961         tsState.tsOutputTopology,
    962         gt_pTessellationThreadData->pTxCtx,
    963         gt_pTessellationThreadData->tsCtxSize);
    964     if (tsCtx == nullptr)
    965     {
                // Allocate the context memory and retry once. The previous pTxCtx
                // value is overwritten without being freed.
    966         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
    967         tsCtx = TSInitCtx(
    968             tsState.domain,
    969             tsState.partitioning,
    970             tsState.tsOutputTopology,
    971             gt_pTessellationThreadData->pTxCtx,
    972             gt_pTessellationThreadData->tsCtxSize);
    973     }
    974     SWR_ASSERT(tsCtx);
    975 
            // Select the clipper that matches the post-DS topology. It is only
            // looked up when rasterizing and only called on the no-GS path below.
    976     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    977     if (HasRastT::value)
    978     {
    979         switch (tsState.postDSTopology)
    980         {
    981         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
    982         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
    983         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
    984         default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
    985         }
    986     }
    987 
            // Point the HS at this thread's patch storage and pass the input
            // primitive IDs through.
    988     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    989     hsContext.pCPout = gt_pTessellationThreadData->patchData;
    990     hsContext.PrimitiveID = primID;
    991 
    992     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    993     // Max storage for one attribute for an entire simdprimitive
    994     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
    995 
    996     // assemble all attributes for the input primitives
    997     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    998     {
    999         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
   1000         pa.Assemble(attribSlot, simdattrib);
   1001 
                // copy this attribute into every vertex of the HS input
   1002         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
   1003         {
   1004             hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
   1005         }
   1006     }
   1007 
        // Debug builds fill the HS output with a recognizable 0x90 byte pattern
        // before the shader runs.
   1008 #if defined(_DEBUG)
   1009     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
   1010 #endif
   1011 
            // The HS mask is generated from the number of primitives the PA assembled.
   1012     uint32_t numPrims = pa.NumPrims();
   1013     hsContext.mask = GenerateMask(numPrims);
   1014 
   1015     // Run the HS
   1016     AR_BEGIN(FEHullShader, pDC->drawId);
   1017     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
   1018     AR_END(FEHullShader, 0);
   1019 
   1020     UPDATE_STAT_FE(HsInvocations, numPrims);
   1021 
            // Scalar view of the SIMD primitive IDs, indexed per patch below.
   1022     const uint32_t* pPrimId = (const uint32_t*)&primID;
   1023 
            // Tessellate and domain-shade each patch on its own.
   1024     for (uint32_t p = 0; p < numPrims; ++p)
   1025     {
   1026         // Run Tessellator
   1027         SWR_TS_TESSELLATED_DATA tsData = { 0 };
   1028         AR_BEGIN(FETessellation, pDC->drawId);
   1029         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
   1030 		AR_EVENT(TessPrimCount(1));
   1031         AR_END(FETessellation, 0);
   1032 
                // Nothing was generated for this patch; skip the DS entirely.
   1033         if (tsData.NumPrimitives == 0)
   1034         {
   1035             continue;
   1036         }
   1037         SWR_ASSERT(tsData.NumDomainPoints);
   1038 
   1039         // Allocate DS Output memory
                // One DS invocation shades KNOB_SIMD_WIDTH domain points, so round the
                // point count up to whole SIMD batches. The cached buffer only ever
                // grows: it is replaced when this patch needs more vectors than it holds.
   1040         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
   1041         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
   1042         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
   1043         if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
   1044         {
   1045             AlignedFree(gt_pTessellationThreadData->pDSOutput);
   1046             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
   1047             gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
   1048         }
   1049         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
   1050         SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
   1051 
   1052 #if defined(_DEBUG)
   1053         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
   1054 #endif
   1055 
   1056         // Run Domain Shader
   1057         SWR_DS_CONTEXT dsContext;
   1058         dsContext.PrimitiveID = pPrimId[p];
   1059         dsContext.pCpIn = &hsContext.pCPout[p];
   1060         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
   1061         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
   1062         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
   1063         dsContext.vectorStride = requiredDSVectorInvocations;
   1064 
                // Number of domain points handed to the DS so far.
   1065         uint32_t dsInvocations = 0;
   1066 
                // One DS call per SIMD batch of domain points; the mask is generated
                // from the count of points not yet shaded.
   1067         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
   1068         {
   1069             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
   1070 
   1071             AR_BEGIN(FEDomainShader, pDC->drawId);
   1072             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
   1073             AR_END(FEDomainShader, 0);
   1074 
   1075             dsInvocations += KNOB_SIMD_WIDTH;
   1076         }
   1077         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
   1078 
                // Primitive assembler over the DS output, driven by the tessellator's
                // index lists.
   1079         PA_TESS tessPa(
   1080             pDC,
   1081             dsContext.pOutputData,
   1082             dsContext.vectorStride,
   1083             tsState.numDsOutputAttribs,
   1084             tsData.ppIndices,
   1085             tsData.NumPrimitives,
   1086             tsState.postDSTopology);
   1087 
                // Drain the tessellated primitives, one SIMD of primitives per iteration.
   1088         while (tessPa.HasWork())
   1089         {
   1090             if (HasGeometryShaderT::value)
   1091             {
                        // The GS stage takes over stream-out and clipping; every lane
                        // gets this patch's primitive ID.
   1092                 GeometryShaderStage<HasStreamOutT, HasRastT>(
   1093                     pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
   1094                     _simd_set1_epi32(dsContext.PrimitiveID));
   1095             }
   1096             else
   1097             {
   1098                 if (HasStreamOutT::value)
   1099                 {
   1100                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
   1101                 }
   1102 
   1103                 if (HasRastT::value)
   1104                 {
   1105                     simdvector prim[3]; // Only deal with triangles, lines, or points
   1106                     AR_BEGIN(FEPAAssemble, pDC->drawId);
                            // 'assemble' is only declared when asserts are enabled because
                            // SWR_ASSERT is its only consumer.
   1107 #if SWR_ENABLE_ASSERTS
   1108                     bool assemble =
   1109 #endif
   1110                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
   1111                     AR_END(FEPAAssemble, 1);
   1112                     SWR_ASSERT(assemble);
   1113 
                            // Clip with this patch's primitive ID in every lane; the last
                            // argument is the viewport array index, fixed at 0 here.
   1114                     SWR_ASSERT(pfnClipFunc);
   1115                     pfnClipFunc(pDC, tessPa, workerId, prim,
   1116                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
   1117                 }
   1118             }
   1119 
   1120             tessPa.NextPrim();
   1121 
   1122         } // while (tessPa.HasWork())
   1123     } // for (uint32_t p = 0; p < numPrims; ++p)
   1124 
            // Tear down the handle created by TSInitCtx above. The pTxCtx memory
            // pointer stays cached in the thread data for the next call.
   1125     TSDestroyCtx(tsCtx);
   1126 }
   1127 
   1128 //////////////////////////////////////////////////////////////////////////
   1129 /// @brief FE handler for SwrDraw.
        ///        For each instance it fetches and vertex-shades one SIMD of
        ///        vertices at a time, assembles primitives from the results and
        ///        routes them to the tessellation stages, the geometry shader
        ///        stage, or stream-out and pfnProcessPrims.
   1130 /// @tparam IsIndexedT - Is indexed drawing enabled
        /// @tparam IsCutIndexEnabledT - Is cut index handling enabled; forwarded to
        ///         PA_FACTORY to choose the primitive assembler
   1131 /// @tparam HasTessellationT - Is tessellation enabled
   1132 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
   1133 /// @tparam HasStreamOutT - Is stream-out enabled
   1134 /// @tparam HasRastT - Is rasterization enabled
   1135 /// @param pContext - pointer to SWR context.
   1136 /// @param pDC - pointer to draw context.
   1137 /// @param workerId - thread's worker id.
   1138 /// @param pUserData - Pointer to DRAW_WORK
   1139 template <
   1140     typename IsIndexedT,
   1141     typename IsCutIndexEnabledT,
   1142     typename HasTessellationT,
   1143     typename HasGeometryShaderT,
   1144     typename HasStreamOutT,
   1145     typename HasRastT>
   1146 void ProcessDraw(
   1147     SWR_CONTEXT *pContext,
   1148     DRAW_CONTEXT *pDC,
   1149     uint32_t workerId,
   1150     void *pUserData)
   1151 {
   1152 
        // Toss point: when KNOB_TOSS_QUEUE_FE is set the draw is dropped before
        // any front-end work (and before AR_BEGIN) happens.
   1153 #if KNOB_ENABLE_TOSS_POINTS
   1154     if (KNOB_TOSS_QUEUE_FE)
   1155     {
   1156         return;
   1157     }
   1158 #endif
   1159 
   1160     AR_BEGIN(FEProcessDraw, pDC->drawId);
   1161 
   1162     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
   1163     const API_STATE&    state = GetApiState(pDC);
            // Lane offsets 0..7 (lane 0 holds 0); added to startVertexID to build the
            // vertex indices of a non-indexed draw.
            // NOTE(review): hard-wired to 8 lanes (__m256i) while the rest of the
            // function uses KNOB_SIMD_WIDTH.
   1164     __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   1165     SWR_VS_CONTEXT      vsContext;
   1166     simdvertex          vin;
   1167 
   1168     int indexSize = 0;
   1169     uint32_t endVertex = work.numVerts;
   1170 
            // For indexed draws: record the index element size and the address just
            // past the last index this draw requests (element endVertex of the index
            // buffer, computed with the matching element type).
   1171     const int32_t* pLastRequestedIndex = nullptr;
   1172     if (IsIndexedT::value)
   1173     {
   1174         switch (work.type)
   1175         {
   1176         case R32_UINT:
   1177             indexSize = sizeof(uint32_t);
   1178             pLastRequestedIndex = &(work.pIB[endVertex]);
   1179             break;
   1180         case R16_UINT:
   1181             indexSize = sizeof(uint16_t);
   1182             // nasty address offset to last index
   1183             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
   1184             break;
   1185         case R8_UINT:
   1186             indexSize = sizeof(uint8_t);
   1187             // nasty address offset to last index
   1188             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
   1189             break;
   1190         default:
   1191             SWR_ASSERT(0);
   1192         }
   1193     }
   1194     else
   1195     {
   1196         // No cuts, prune partial primitives.
   1197         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
   1198     }
   1199 
   1200     SWR_FETCH_CONTEXT fetchInfo = { 0 };
   1201     fetchInfo.pStreams = &state.vertexBuffers[0];
   1202     fetchInfo.StartInstance = work.startInstance;
   1203     fetchInfo.StartVertex = 0;
   1204 
            // The fetch shader writes into 'vin', which the vertex shader then reads.
   1205     vsContext.pVin = &vin;
   1206 
   1207     if (IsIndexedT::value)
   1208     {
   1209         fetchInfo.BaseVertex = work.baseVertex;
   1210 
   1211         // if the entire index buffer isn't being consumed, set the last index
   1212         // so that fetches < a SIMD wide will be masked off
   1213         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
   1214         if (pLastRequestedIndex < fetchInfo.pLastIndex)
   1215         {
   1216             fetchInfo.pLastIndex = pLastRequestedIndex;
   1217         }
   1218     }
   1219     else
   1220     {
   1221         fetchInfo.StartVertex = work.startVertex;
   1222     }
   1223 
        // numPrims is only consumed by the AR_END at the bottom of the function.
        // NOTE(review): this relies on AR_END discarding its arguments when
        // neither knob is defined - confirm.
   1224 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
   1225     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
   1226 #endif
   1227 
            // GS scratch buffers are allocated once per draw and shared by every
            // GeometryShaderStage call below.
   1228     void* pGsOut = nullptr;
   1229     void* pCutBuffer = nullptr;
   1230     void* pStreamCutBuffer = nullptr;
   1231     if (HasGeometryShaderT::value)
   1232     {
   1233         AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
   1234     }
   1235 
            // The template flag and the API state must agree about tessellation.
   1236     if (HasTessellationT::value)
   1237     {
   1238         SWR_ASSERT(state.tsState.tsEnable == true);
   1239         SWR_ASSERT(state.pfnHsFunc != nullptr);
   1240         SWR_ASSERT(state.pfnDsFunc != nullptr);
   1241 
   1242         AllocateTessellationData(pContext);
   1243     }
   1244     else
   1245     {
   1246         SWR_ASSERT(state.tsState.tsEnable == false);
   1247         SWR_ASSERT(state.pfnHsFunc == nullptr);
   1248         SWR_ASSERT(state.pfnDsFunc == nullptr);
   1249     }
   1250 
   1251     // allocate space for streamout input prim data
            // (fixed 4096-byte, 16-byte aligned block from the draw arena)
   1252     uint32_t* pSoPrimData = nullptr;
   1253     if (HasStreamOutT::value)
   1254     {
   1255         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
   1256     }
   1257 
   1258     // choose primitive assembler
   1259     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
   1260     PA_STATE& pa = paFactory.GetPA();
   1261 
   1262     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
   1263     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
   1264     {
   1265         simdscalari vIndex;
                // 'i' counts the vertices consumed so far for this instance; it advances
                // by KNOB_SIMD_WIDTH per loop iteration.
   1266         uint32_t  i = 0;
   1267 
                // Indexed draws read straight from the index buffer; non-indexed draws
                // read generated indices from the local vIndex vector.
   1268         if (IsIndexedT::value)
   1269         {
   1270             fetchInfo.pIndices = work.pIB;
   1271         }
   1272         else
   1273         {
   1274             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
   1275             fetchInfo.pIndices = (const int32_t*)&vIndex;
   1276         }
   1277 
   1278         fetchInfo.CurInstance = instanceNum;
   1279         vsContext.InstanceID = instanceNum;
   1280 
   1281         while (pa.HasWork())
   1282         {
   1283             // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
   1284             // So we need to keep this outside of (i < endVertex) check.
   1285             simdmask* pvCutIndices = nullptr;
   1286             if (IsIndexedT::value)
   1287             {
   1288                 pvCutIndices = &pa.GetNextVsIndices();
   1289             }
   1290 
                    // The vertex shader writes directly into the PA's next output slot.
   1291             simdvertex& vout = pa.GetNextVsOutput();
   1292             vsContext.pVout = &vout;
   1293 
                    // Fetch and shade only while vertices remain; the assembly loop
                    // below still runs on every iteration while pa.HasWork() holds.
   1294             if (i < endVertex)
   1295             {
   1296 
   1297                 // 1. Execute FS/VS for a single SIMD.
   1298                 AR_BEGIN(FEFetchShader, pDC->drawId);
   1299                 state.pfnFetchFunc(fetchInfo, vin);
   1300                 AR_END(FEFetchShader, 0);
   1301 
   1302                 // forward fetch generated vertex IDs to the vertex shader
   1303                 vsContext.VertexID = fetchInfo.VertexID;
   1304 
   1305                 // Setup active mask for vertex shader.
   1306                 vsContext.mask = GenerateMask(endVertex - i);
   1307 
   1308                 // forward cut mask to the PA
   1309                 if (IsIndexedT::value)
   1310                 {
   1311                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
   1312                 }
   1313 
   1314                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
   1315 
        // Toss point: KNOB_TOSS_FETCH skips the vertex shader after the fetch.
   1316 #if KNOB_ENABLE_TOSS_POINTS
   1317                 if (!KNOB_TOSS_FETCH)
   1318 #endif
   1319                 {
   1320                     AR_BEGIN(FEVertexShader, pDC->drawId);
   1321                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
   1322                     AR_END(FEVertexShader, 0);
   1323 
   1324                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
   1325                 }
   1326             }
   1327 
   1328             // 2. Assemble primitives given the last two SIMD.
   1329             do
   1330             {
   1331                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
   1332                 // PaAssemble returns false if there is not enough verts to assemble.
   1333                 AR_BEGIN(FEPAAssemble, pDC->drawId);
   1334                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
   1335                 AR_END(FEPAAssemble, 1);
   1336 
        // Toss points: KNOB_TOSS_FETCH / KNOB_TOSS_VS drop the assembled
        // primitives before any downstream stage sees them.
   1337 #if KNOB_ENABLE_TOSS_POINTS
   1338                 if (!KNOB_TOSS_FETCH)
   1339 #endif
   1340                 {
   1341 #if KNOB_ENABLE_TOSS_POINTS
   1342                     if (!KNOB_TOSS_VS)
   1343 #endif
   1344                     {
   1345                         if (assemble)
   1346                         {
   1347                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
   1348 
                                    // Exactly one downstream path runs: tessellation (which
                                    // invokes the GS itself when enabled), the GS alone, or
                                    // direct stream-out / primitive processing.
   1349                             if (HasTessellationT::value)
   1350                             {
   1351                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
   1352                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
   1353                             }
   1354                             else if (HasGeometryShaderT::value)
   1355                             {
   1356                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
   1357                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
   1358                             }
   1359                             else
   1360                             {
   1361                                 // If streamout is enabled then stream vertices out to memory.
   1362                                 if (HasStreamOutT::value)
   1363                                 {
   1364                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
   1365                                 }
   1366 
   1367                                 if (HasRastT::value)
   1368                                 {
   1369                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
   1370                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
   1371                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
   1372                                 }
   1373                             }
   1374                         }
   1375                     }
   1376                 }
   1377             } while (pa.NextPrim());
   1378 
                    // Advance to the next SIMD of vertices: step the index pointer by
                    // KNOB_SIMD_WIDTH elements, or bump the generated indices.
   1379             i += KNOB_SIMD_WIDTH;
   1380             if (IsIndexedT::value)
   1381             {
   1382                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
   1383             }
   1384             else
   1385             {
   1386                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
   1387             }
   1388         }
                // NOTE(review): presumably rewinds the PA state machine so the next
                // instance starts from the first vertex - confirm against PA_STATE.
   1389         pa.Reset();
   1390     }
   1391 
   1392 
   1393     AR_END(FEProcessDraw, numPrims * work.numInstances);
   1394 }
   1395 
   1396 struct FEDrawChooser
   1397 {
   1398     typedef PFN_FE_WORK_FUNC FuncType;
   1399 
   1400     template <typename... ArgsB>
   1401     static FuncType GetFunc()
   1402     {
   1403         return ProcessDraw<ArgsB...>;
   1404     }
   1405 };
   1406 
   1407 
   1408 // Selector for correct templated Draw front-end function
   1409 PFN_FE_WORK_FUNC GetProcessDrawFunc(
   1410     bool IsIndexed,
   1411     bool IsCutIndexEnabled,
   1412     bool HasTessellation,
   1413     bool HasGeometryShader,
   1414     bool HasStreamOut,
   1415     bool HasRasterization)
   1416 {
   1417     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
   1418 }