Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.cpp
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 
     30 #include <smmintrin.h>
     31 
     32 #include "backend.h"
     33 #include "depthstencil.h"
     34 #include "tilemgr.h"
     35 #include "memory/tilingtraits.h"
     36 #include "core/multisample.h"
     37 
     38 #include <algorithm>
     39 
     40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
     41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
     42 
     43 //////////////////////////////////////////////////////////////////////////
     44 /// @brief Process compute work.
     45 /// @param pDC - pointer to draw context (dispatch).
     46 /// @param workerId - The unique worker ID that is assigned to this thread.
     47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
     48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
     49 {
     50     SWR_CONTEXT *pContext = pDC->pContext;
     51 
     52     AR_BEGIN(BEDispatch, pDC->drawId);
     53 
     54     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
     55     SWR_ASSERT(pTaskData != nullptr);
     56 
     57     // Ensure spill fill memory has been allocated.
     58     size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
     59     if (spillFillSize && pSpillFillBuffer == nullptr)
     60     {
     61         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     62     }
     63 
     64     const API_STATE& state = GetApiState(pDC);
     65 
     66     SWR_CS_CONTEXT csContext{ 0 };
     67     csContext.tileCounter = threadGroupId;
     68     csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
     69     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     70     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     71     csContext.pTGSM = pContext->ppScratch[workerId];
     72     csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
     73 
     74     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
     75 
     76     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
     77 
     78     AR_END(BEDispatch, 1);
     79 }
     80 
     81 //////////////////////////////////////////////////////////////////////////
     82 /// @brief Process shutdown.
     83 /// @param pDC - pointer to draw context (dispatch).
     84 /// @param workerId - The unique worker ID that is assigned to this thread.
     85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
     86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
     87 {
     88     // Dummy function
     89 }
     90 
     91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
     92 {
     93     uint32_t x, y;
     94     MacroTileMgr::getTileIndices(macroTile, x, y);
     95     SWR_ASSERT(x == 0 && y == 0);
     96 }
     97 
     98 template<SWR_FORMAT format>
     99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
    100 {
    101     auto lambda = [&](int32_t comp)
    102     {
    103         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
    104 
    105         pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    106     };
    107 
    108     const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
    109 
    110     for (uint32_t i = 0; i < numIter; ++i)
    111     {
    112         UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
    113     }
    114 }
    115 
    116 #if USE_8x2_TILE_BACKEND
    117 template<SWR_FORMAT format>
    118 void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
    119 {
    120     auto lambda = [&](int32_t comp)
    121     {
    122         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
    123 
    124         pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    125     };
    126 
    127     const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
    128 
    129     for (uint32_t i = 0; i < numIter; ++i)
    130     {
    131         UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
    132     }
    133 }
    134 
    135 #endif
    136 template<SWR_FORMAT format>
    137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
    138 {
    139     // convert clear color to hottile format
    140     // clear color is in RGBA float/uint32
    141 #if USE_8x2_TILE_BACKEND
    142     simd16vector vClear;
    143     for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
    144     {
    145         simd16scalar vComp;
    146         vComp = _simd16_load1_ps((const float*)&clear[comp]);
    147         if (FormatTraits<format>::isNormalized(comp))
    148         {
    149             vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
    150             vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
    151         }
    152         vComp = FormatTraits<format>::pack(comp, vComp);
    153         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
    154     }
    155 
    156 #else
    157     simdvector vClear;
    158     for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
    159     {
    160         simdscalar vComp;
    161         vComp = _simd_load1_ps((const float*)&clear[comp]);
    162         if (FormatTraits<format>::isNormalized(comp))
    163         {
    164             vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
    165             vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
    166         }
    167         vComp = FormatTraits<format>::pack(comp, vComp);
    168         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
    169     }
    170 
    171 #endif
    172     uint32_t tileX, tileY;
    173     MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
    174 
    175     // Init to full macrotile
    176     SWR_RECT clearTile =
    177     {
    178         KNOB_MACROTILE_X_DIM * int32_t(tileX),
    179         KNOB_MACROTILE_Y_DIM * int32_t(tileY),
    180         KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
    181         KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
    182     };
    183 
    184     // intersect with clear rect
    185     clearTile &= rect;
    186 
    187     // translate to local hottile origin
    188     clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
    189 
    190     // Make maximums inclusive (needed for convert to raster tiles)
    191     clearTile.xmax -= 1;
    192     clearTile.ymax -= 1;
    193 
    194     // convert to raster tiles
    195     clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
    196     clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
    197     clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
    198     clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
    199 
    200     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
    201     // compute steps between raster tile samples / raster tiles / macro tile rows
    202     const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
    203     const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
    204     const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
    205     const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
    206 
    207     HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
    208     uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
    209     uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
    210 
    211     // loop over all raster tiles in the current hot tile
    212     for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
    213     {
    214         uint8_t* pRasterTile = pRasterTileRow;
    215         for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
    216         {
    217             for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
    218             {
    219                 ClearRasterTile<format>(pRasterTile, vClear);
    220                 pRasterTile += rasterTileSampleStep;
    221             }
    222         }
    223         pRasterTileRow += macroTileRowStep;
    224     }
    225 
    226     pHotTile->state = HOTTILE_DIRTY;
    227 }
    228 
    229 
    230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
    231 {
    232     SWR_CONTEXT *pContext = pDC->pContext;
    233 
    234     if (KNOB_FAST_CLEAR)
    235     {
    236         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
    237         SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
    238         uint32_t numSamples = GetNumSamples(sampleCount);
    239 
    240         SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
    241 
    242         AR_BEGIN(BEClear, pDC->drawId);
    243 
    244         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
    245         {
    246             unsigned long rt = 0;
    247             uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
    248             while (_BitScanForward(&rt, mask))
    249             {
    250                 mask &= ~(1 << rt);
    251 
    252                 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
    253 
    254                 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
    255                 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
    256                 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
    257                 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
    258                 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
    259                 pHotTile->state = HOTTILE_CLEAR;
    260             }
    261         }
    262 
    263         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
    264         {
    265             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
    266             pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
    267             pHotTile->state = HOTTILE_CLEAR;
    268         }
    269 
    270         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
    271         {
    272             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
    273 
    274             pHotTile->clearData[0] = pClear->clearStencil;
    275             pHotTile->state = HOTTILE_CLEAR;
    276         }
    277 
    278         AR_END(BEClear, 1);
    279     }
    280     else
    281     {
    282         // Legacy clear
    283         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
    284         AR_BEGIN(BEClear, pDC->drawId);
    285 
    286         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
    287         {
    288             DWORD clearData[4];
    289             clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
    290             clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
    291             clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
    292             clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
    293 
    294             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
    295             SWR_ASSERT(pfnClearTiles != nullptr);
    296 
    297             unsigned long rt = 0;
    298             uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
    299             while (_BitScanForward(&rt, mask))
    300             {
    301                 mask &= ~(1 << rt);
    302 
    303                 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
    304             }
    305         }
    306 
    307         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
    308         {
    309             DWORD clearData[4];
    310             clearData[0] = *(DWORD*)&pClear->clearDepth;
    311             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
    312             SWR_ASSERT(pfnClearTiles != nullptr);
    313 
    314             pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
    315         }
    316 
    317         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
    318         {
    319             DWORD clearData[4];
    320             clearData[0] = pClear->clearStencil;
    321             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
    322 
    323             pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
    324         }
    325 
    326         AR_END(BEClear, 1);
    327     }
    328 }
    329 
    330 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
    331     SWR_RENDERTARGET_ATTACHMENT attachment)
    332 {
    333     SWR_CONTEXT *pContext = pDC->pContext;
    334 
    335     AR_BEGIN(BEStoreTiles, pDC->drawId);
    336 
    337     SWR_FORMAT srcFormat;
    338     switch (attachment)
    339     {
    340     case SWR_ATTACHMENT_COLOR0:
    341     case SWR_ATTACHMENT_COLOR1:
    342     case SWR_ATTACHMENT_COLOR2:
    343     case SWR_ATTACHMENT_COLOR3:
    344     case SWR_ATTACHMENT_COLOR4:
    345     case SWR_ATTACHMENT_COLOR5:
    346     case SWR_ATTACHMENT_COLOR6:
    347     case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    348     case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
    349     case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
    350     default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    351     }
    352 
    353     uint32_t x, y;
    354     MacroTileMgr::getTileIndices(macroTile, x, y);
    355 
    356     // Only need to store the hottile if it's been rendered to...
    357     HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    358     if (pHotTile)
    359     {
    360         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
    361         if (pHotTile->state == HOTTILE_CLEAR)
    362         {
    363             PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
    364             SWR_ASSERT(pfnClearTiles != nullptr);
    365 
    366             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
    367         }
    368 
    369         if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
    370         {
    371             int32_t destX = KNOB_MACROTILE_X_DIM * x;
    372             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
    373 
    374             pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
    375                 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
    376         }
    377 
    378 
    379         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
    380         {
    381             pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
    382         }
    383     }
    384     AR_END(BEStoreTiles, 1);
    385 }
    386 
    387 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
    388 {
    389     STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
    390 
    391     unsigned long rt = 0;
    392     uint32_t mask = pDesc->attachmentMask;
    393     while (_BitScanForward(&rt, mask))
    394     {
    395         mask &= ~(1 << rt);
    396         ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
    397     }
    398 }
    399 
    400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
    401 {
    402     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
    403     SWR_CONTEXT *pContext = pDC->pContext;
    404 
    405     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
    406 
    407     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
    408     {
    409         if (pDesc->attachmentMask & (1 << i))
    410         {
    411             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
    412                 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
    413             if (pHotTile)
    414             {
    415                 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
    416             }
    417         }
    418     }
    419 }
    420 
    421 #if KNOB_SIMD_WIDTH == 8
    422 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
    423 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
    424 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
    425 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
    426 #else
    427 #error Unsupported vector width
    428 #endif
    429 
    430 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
    431 {
    432     simdscalar vClipMask = _simd_setzero_ps();
    433     uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
    434 
    435     for (uint32_t i = 0; i < numClipDistance; ++i)
    436     {
    437         // pull triangle clip distance values from clip buffer
    438         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
    439         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
    440         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
    441 
    442         // interpolate
    443         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
    444 
    445         // clip if interpolated clip distance is < 0 || NAN
    446         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
    447 
    448         vClipMask = _simd_or_ps(vClipMask, vCull);
    449     }
    450 
    451     return _simd_movemask_ps(vClipMask);
    452 }
    453 
    454 template<typename T>
    455 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
    456 {
    457     SWR_CONTEXT *pContext = pDC->pContext;
    458 
    459     AR_BEGIN(BESingleSampleBackend, pDC->drawId);
    460     AR_BEGIN(BESetup, pDC->drawId);
    461 
    462     const API_STATE &state = GetApiState(pDC);
    463 
    464     BarycentricCoeffs coeffs;
    465     SetupBarycentricCoeffs(&coeffs, work);
    466 
    467     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
    468     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
    469 
    470     SWR_PS_CONTEXT psContext;
    471     SetupPixelShaderContext<T>(&psContext, work);
    472 
    473     AR_END(BESetup, 1);
    474 
    475     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
    476     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    477 
    478     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    479 
    480     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    481     {
    482         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
    483         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
    484 
    485         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
    486 
    487         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
    488         {
    489 #if USE_8x2_TILE_BACKEND
    490             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
    491 
    492 #endif
    493             simdmask coverageMask = work.coverageMask[0] & MASK;
    494 
    495             if (coverageMask)
    496             {
    497                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    498                 {
    499                     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    500 
    501                     const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
    502 
    503                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    504                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    505 
    506                     coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
    507                 }
    508 
    509                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
    510                 {
    511                     const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
    512 
    513                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
    514                 }
    515 
    516                 AR_BEGIN(BEBarycentric, pDC->drawId);
    517 
    518                 CalcPixelBarycentrics(coeffs, psContext);
    519 
    520                 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
    521 
    522                 // interpolate and quantize z
    523                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
    524                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    525 
    526                 AR_END(BEBarycentric, 1);
    527 
    528                 // interpolate user clip distance if available
    529                 if (state.rastState.clipDistanceMask)
    530                 {
    531                     coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
    532                 }
    533 
    534                 simdscalar vCoverageMask = vMask(coverageMask);
    535                 simdscalar depthPassMask = vCoverageMask;
    536                 simdscalar stencilPassMask = vCoverageMask;
    537 
    538                 // Early-Z?
    539                 if (T::bCanEarlyZ)
    540                 {
    541                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
    542                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    543                                                      psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
    544                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
    545                     AR_END(BEEarlyDepthTest, 0);
    546 
    547                     // early-exit if no pixels passed depth or earlyZ is forced on
    548                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
    549                     {
    550                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    551                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    552 
    553                         if (!_simd_movemask_ps(depthPassMask))
    554                         {
    555                             goto Endtile;
    556                         }
    557                     }
    558                 }
    559 
    560                 psContext.sampleIndex = 0;
    561                 psContext.activeMask = _simd_castps_si(vCoverageMask);
    562 
    563                 // execute pixel shader
    564                 AR_BEGIN(BEPixelShader, pDC->drawId);
    565                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
    566                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    567                 AR_END(BEPixelShader, 0);
    568 
    569                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
    570 
    571                 // late-Z
    572                 if (!T::bCanEarlyZ)
    573                 {
    574                     AR_BEGIN(BELateDepthTest, pDC->drawId);
    575                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    576                                                         psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
    577                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
    578                     AR_END(BELateDepthTest, 0);
    579 
    580                     if (!_simd_movemask_ps(depthPassMask))
    581                     {
    582                         // need to call depth/stencil write for stencil write
    583                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    584                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    585                         goto Endtile;
    586                     }
    587                 }
    588 
    589                 uint32_t statMask = _simd_movemask_ps(depthPassMask);
    590                 uint32_t statCount = _mm_popcnt_u32(statMask);
    591                 UPDATE_STAT_BE(DepthPassCount, statCount);
    592 
    593                 // output merger
    594                 AR_BEGIN(BEOutputMerger, pDC->drawId);
    595 #if USE_8x2_TILE_BACKEND
    596                 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
    597 #else
    598                 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
    599 #endif
    600 
    601                 // do final depth write after all pixel kills
    602                 if (!state.psState.forceEarlyZ)
    603                 {
    604                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    605                         pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    606                 }
    607                 AR_END(BEOutputMerger, 0);
    608             }
    609 
    610 Endtile:
    611             AR_BEGIN(BEEndTile, pDC->drawId);
    612 
    613             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    614             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
    615             {
    616                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    617             }
    618 
    619 #if USE_8x2_TILE_BACKEND
    620             if (useAlternateOffset)
    621             {
    622                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
    623                 {
    624                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    625                 }
    626             }
    627 #else
    628             for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
    629             {
    630                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    631             }
    632 #endif
    633             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
    634             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
    635 
    636             AR_END(BEEndTile, 0);
    637 
    638             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
    639             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
    640         }
    641 
    642         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
    643         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    644     }
    645 
    646     AR_END(BESingleSampleBackend, 0);
    647 }
    648 
    649 template<typename T>
    650 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
    651 {
    652     SWR_CONTEXT *pContext = pDC->pContext;
    653 
    654     AR_BEGIN(BESampleRateBackend, pDC->drawId);
    655     AR_BEGIN(BESetup, pDC->drawId);
    656 
    657     const API_STATE &state = GetApiState(pDC);
    658 
    659     BarycentricCoeffs coeffs;
    660     SetupBarycentricCoeffs(&coeffs, work);
    661 
    662     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
    663     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
    664 
    665     SWR_PS_CONTEXT psContext;
    666     SetupPixelShaderContext<T>(&psContext, work);
    667 
    668     AR_END(BESetup, 0);
    669 
    670     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
    671     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    672 
    673     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    674 
    675     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    676     {
    677         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
    678         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
    679 
    680         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
    681 
    682         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
    683         {
    684 #if USE_8x2_TILE_BACKEND
    685             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
    686 
    687 #endif
    688             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
    689             {
    690                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
    691 
    692                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
    693             }
    694 
    695             AR_BEGIN(BEBarycentric, pDC->drawId);
    696 
    697             CalcPixelBarycentrics(coeffs, psContext);
    698 
    699             CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
    700 
    701             AR_END(BEBarycentric, 0);
    702 
    703             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
    704             {
    705                 simdmask coverageMask = work.coverageMask[sample] & MASK;
    706 
    707                 if (coverageMask)
    708                 {
    709                     // offset depth/stencil buffers current sample
    710                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
    711                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
    712 
    713                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    714                     {
    715                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    716 
    717                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
    718 
    719                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    720                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    721 
    722                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
    723                     }
    724 
    725                     AR_BEGIN(BEBarycentric, pDC->drawId);
    726 
    727                     // calculate per sample positions
    728                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
    729                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
    730 
    731                     CalcSampleBarycentrics(coeffs, psContext);
    732 
    733                     // interpolate and quantize z
    734                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
    735                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    736 
    737                     AR_END(BEBarycentric, 0);
    738 
    739                     // interpolate user clip distance if available
    740                     if (state.rastState.clipDistanceMask)
    741                     {
    742                         coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
    743                     }
    744 
    745                     simdscalar vCoverageMask = vMask(coverageMask);
    746                     simdscalar depthPassMask = vCoverageMask;
    747                     simdscalar stencilPassMask = vCoverageMask;
    748 
    749                     // Early-Z?
    750                     if (T::bCanEarlyZ)
    751                     {
    752                         AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
    753                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    754                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
    755                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
    756                         AR_END(BEEarlyDepthTest, 0);
    757 
    758                         // early-exit if no samples passed depth or earlyZ is forced on.
    759                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
    760                         {
    761                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    762                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    763 
    764                             if (!_simd_movemask_ps(depthPassMask))
    765                             {
    766                                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    767                                 continue;
    768                             }
    769                         }
    770                     }
    771 
    772                     psContext.sampleIndex = sample;
    773                     psContext.activeMask = _simd_castps_si(vCoverageMask);
    774 
    775                     // execute pixel shader
    776                     AR_BEGIN(BEPixelShader, pDC->drawId);
    777                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
    778                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    779                     AR_END(BEPixelShader, 0);
    780 
    781                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
    782 
    783                     // late-Z
    784                     if (!T::bCanEarlyZ)
    785                     {
    786                         AR_BEGIN(BELateDepthTest, pDC->drawId);
    787                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    788                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
    789                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
    790                         AR_END(BELateDepthTest, 0);
    791 
    792                         if (!_simd_movemask_ps(depthPassMask))
    793                         {
    794                             // need to call depth/stencil write for stencil write
    795                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    796                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    797 
    798                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    799                             continue;
    800                         }
    801                     }
    802 
    803                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
    804                     uint32_t statCount = _mm_popcnt_u32(statMask);
    805                     UPDATE_STAT_BE(DepthPassCount, statCount);
    806 
    807                     // output merger
    808                     AR_BEGIN(BEOutputMerger, pDC->drawId);
    809 #if USE_8x2_TILE_BACKEND
    810                     OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
    811 #else
    812                     OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
    813 #endif
    814 
    815                     // do final depth write after all pixel kills
    816                     if (!state.psState.forceEarlyZ)
    817                     {
    818                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    819                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    820                     }
    821                     AR_END(BEOutputMerger, 0);
    822                 }
    823                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    824             }
    825 
    826 Endtile:
    827             ATTR_UNUSED;
    828 
    829             AR_BEGIN(BEEndTile, pDC->drawId);
    830 
    831             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
    832             {
    833                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    834             }
    835 
    836 #if USE_8x2_TILE_BACKEND
    837             if (useAlternateOffset)
    838             {
    839                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
    840                 {
    841                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    842                 }
    843             }
    844 #else
    845             for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
    846             {
    847                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    848             }
    849 #endif
    850             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
    851             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
    852 
    853             AR_END(BEEndTile, 0);
    854 
    855             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
    856             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
    857         }
    858 
    859         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
    860         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    861     }
    862 
    863     AR_END(BESampleRateBackend, 0);
    864 }
    865 
    866 template<typename T>
    867 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
    868 {
    869     SWR_CONTEXT *pContext = pDC->pContext;
    870 
    871     AR_BEGIN(BEPixelRateBackend, pDC->drawId);
    872     AR_BEGIN(BESetup, pDC->drawId);
    873 
    874     const API_STATE &state = GetApiState(pDC);
    875 
    876     BarycentricCoeffs coeffs;
    877     SetupBarycentricCoeffs(&coeffs, work);
    878 
    879     uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
    880     SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
    881 
    882     SWR_PS_CONTEXT psContext;
    883     SetupPixelShaderContext<T>(&psContext, work);
    884 
    885     AR_END(BESetup, 0);
    886 
    887     PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
    888 
    889     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
    890     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    891 
    892     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    893 
    894     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    895     {
    896         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
    897         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
    898 
    899         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
    900 
    901         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
    902         {
    903 #if USE_8x2_TILE_BACKEND
    904             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
    905 
    906 #endif
    907             simdscalar activeLanes;
    908             if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
    909             activeLanes = vMask(work.anyCoveredSamples & MASK);
    910 
    911             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
    912             {
    913                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
    914 
    915                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
    916             }
    917 
    918             AR_BEGIN(BEBarycentric, pDC->drawId);
    919 
    920             CalcPixelBarycentrics(coeffs, psContext);
    921 
    922             CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
    923 
    924             AR_END(BEBarycentric, 0);
    925 
    926             if(T::bForcedSampleCount)
    927             {
    928                 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
    929                 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
    930                 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
    931             }
    932 
    933             // Early-Z?
    934             if(T::bCanEarlyZ && !T::bForcedSampleCount)
    935             {
    936                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
    937                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
    938                 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
    939             }
    940 
    941             // if we have no covered samples that passed depth at this point, go to next tile
    942             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    943 
    944             if(state.psState.usesSourceDepth)
    945             {
    946                 AR_BEGIN(BEBarycentric, pDC->drawId);
    947                 // interpolate and quantize z
    948                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
    949                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    950                 AR_END(BEBarycentric, 0);
    951             }
    952 
    953             // pixels that are currently active
    954             psContext.activeMask = _simd_castps_si(activeLanes);
    955             psContext.oMask = T::MultisampleT::FullSampleMask();
    956 
    957             // execute pixel shader
    958             AR_BEGIN(BEPixelShader, pDC->drawId);
    959             state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    960             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
    961             AR_END(BEPixelShader, 0);
    962 
    963             // update active lanes to remove any discarded or oMask'd pixels
    964             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
    965             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    966 
    967             // late-Z
    968             if(!T::bCanEarlyZ && !T::bForcedSampleCount)
    969             {
    970                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
    971                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
    972                 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
    973             }
    974 
    975             // if we have no covered samples that passed depth at this point, skip OM and go to next tile
    976             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    977 
    978             // output merger
    979             // loop over all samples, broadcasting the results of the PS to all passing pixels
    980             for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
    981             {
    982                 AR_BEGIN(BEOutputMerger, pDC->drawId);
    983                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
    984                 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
    985                 simdscalar coverageMask, depthMask;
    986                 if(T::bForcedSampleCount)
    987                 {
    988                     coverageMask = depthMask = activeLanes;
    989                 }
    990                 else
    991                 {
    992                     coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
    993                     depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
    994                     if(!_simd_movemask_ps(depthMask))
    995                     {
    996                         // stencil should already have been written in early/lateZ tests
    997                         AR_END(BEOutputMerger, 0);
    998                         continue;
    999                     }
   1000                 }
   1001 
   1002                 // broadcast the results of the PS to all passing pixels
   1003 #if USE_8x2_TILE_BACKEND
   1004                 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
   1005 #else
   1006                 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
   1007 #endif
   1008 
   1009                 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
   1010                 {
   1011                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
   1012                     uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
   1013 
   1014                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
   1015                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
   1016                 }
   1017                 AR_END(BEOutputMerger, 0);
   1018             }
   1019 Endtile:
   1020             AR_BEGIN(BEEndTile, pDC->drawId);
   1021 
   1022             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
   1023             {
   1024                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1025             }
   1026 
   1027             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
   1028             {
   1029                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1030             }
   1031             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1032 
   1033 #if USE_8x2_TILE_BACKEND
   1034             if (useAlternateOffset)
   1035             {
   1036                 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
   1037                 {
   1038                     pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
   1039                 }
   1040             }
   1041 #else
   1042             for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
   1043             {
   1044                 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
   1045             }
   1046             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
   1047             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
   1048 #endif
   1049 
   1050             AR_END(BEEndTile, 0);
   1051 
   1052             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
   1053             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
   1054         }
   1055 
   1056         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
   1057         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
   1058     }
   1059 
   1060     AR_END(BEPixelRateBackend, 0);
   1061 }
   1062 // optimized backend flow with NULL PS
   1063 template<uint32_t sampleCountT>
   1064 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
   1065 {
   1066     SWR_CONTEXT *pContext = pDC->pContext;
   1067 
   1068     AR_BEGIN(BENullBackend, pDC->drawId);
   1069     ///@todo: handle center multisample pattern
   1070     typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
   1071     AR_BEGIN(BESetup, pDC->drawId);
   1072 
   1073     const API_STATE &state = GetApiState(pDC);
   1074 
   1075     BarycentricCoeffs coeffs;
   1076     SetupBarycentricCoeffs(&coeffs, work);
   1077 
   1078     uint8_t *pDepthBuffer, *pStencilBuffer;
   1079     SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
   1080 
   1081     SWR_PS_CONTEXT psContext;
   1082     // skip SetupPixelShaderContext(&psContext, ...); // not needed here
   1083 
   1084     AR_END(BESetup, 0);
   1085 
   1086     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
   1087 
   1088     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
   1089 
   1090     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
   1091     {
   1092         simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
   1093 
   1094         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
   1095 
   1096         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
   1097         {
   1098             // iterate over active samples
   1099             unsigned long sample = 0;
   1100             uint32_t sampleMask = state.blendState.sampleMask;
   1101             while (_BitScanForward(&sample, sampleMask))
   1102             {
   1103                 sampleMask &= ~(1 << sample);
   1104 
   1105                 simdmask coverageMask = work.coverageMask[sample] & MASK;
   1106 
   1107                 if (coverageMask)
   1108                 {
   1109                     // offset depth/stencil buffers current sample
   1110                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
   1111                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
   1112 
   1113                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
   1114                     {
   1115                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
   1116 
   1117                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
   1118 
   1119                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
   1120                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
   1121 
   1122                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
   1123                     }
   1124 
   1125                     AR_BEGIN(BEBarycentric, pDC->drawId);
   1126 
   1127                     // calculate per sample positions
   1128                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
   1129                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
   1130 
   1131                     CalcSampleBarycentrics(coeffs, psContext);
   1132 
   1133                     // interpolate and quantize z
   1134                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
   1135                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
   1136 
   1137                     AR_END(BEBarycentric, 0);
   1138 
   1139                     // interpolate user clip distance if available
   1140                     if (state.rastState.clipDistanceMask)
   1141                     {
   1142                         coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
   1143                     }
   1144 
   1145                     simdscalar vCoverageMask = vMask(coverageMask);
   1146                     simdscalar stencilPassMask = vCoverageMask;
   1147 
   1148                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
   1149                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
   1150                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
   1151                     AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask)));
   1152                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
   1153                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
   1154                     AR_END(BEEarlyDepthTest, 0);
   1155 
   1156                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
   1157                     uint32_t statCount = _mm_popcnt_u32(statMask);
   1158                     UPDATE_STAT_BE(DepthPassCount, statCount);
   1159                 }
   1160 
   1161 Endtile:
   1162                 ATTR_UNUSED;
   1163                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1164             }
   1165 
   1166             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
   1167             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
   1168 
   1169             vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
   1170         }
   1171 
   1172         vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
   1173     }
   1174 
   1175     AR_END(BENullBackend, 0);
   1176 }
   1177 
   1178 void InitClearTilesTable()
   1179 {
   1180     memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
   1181 
   1182     sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
   1183     sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
   1184     sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
   1185     sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
   1186     sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
   1187 }
   1188 
   1189 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
   1190 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
   1191                                      [2] // centroid
   1192                                      [2] // canEarlyZ
   1193                                      = {};
   1194 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
   1195                                        [SWR_MSAA_SAMPLE_PATTERN_COUNT]
   1196                                        [SWR_INPUT_COVERAGE_COUNT]
   1197                                        [2] // centroid
   1198                                        [2] // forcedSampleCount
   1199                                        [2] // canEarlyZ
   1200                                        = {};
   1201 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
   1202                                         [SWR_INPUT_COVERAGE_COUNT]
   1203                                         [2] // centroid
   1204                                         [2] // canEarlyZ
   1205                                         = {};
   1206 
   1207 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
   1208 // arguments to static template arguments.
   1209 template <uint32_t... ArgsT>
   1210 struct BEChooser
   1211 {
   1212     // Last Arg Terminator
   1213     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
   1214     {
   1215         switch(tArg)
   1216         {
   1217         case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
   1218         case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
   1219         case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
   1220         default:
   1221             SWR_ASSERT(0 && "Invalid backend func\n");
   1222             return nullptr;
   1223             break;
   1224         }
   1225     }
   1226 
   1227     // Recursively parse args
   1228     template <typename... TArgsT>
   1229     static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
   1230     {
   1231         switch(tArg)
   1232         {
   1233         case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
   1234         case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
   1235         default:
   1236         SWR_ASSERT(0 && "Invalid sample pattern\n");
   1237         return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
   1238         break;
   1239         }
   1240     }
   1241 
   1242     // Recursively parse args
   1243     template <typename... TArgsT>
   1244     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
   1245     {
   1246         switch(tArg)
   1247         {
   1248         case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
   1249         case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
   1250         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
   1251         default:
   1252         SWR_ASSERT(0 && "Invalid sample pattern\n");
   1253         return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
   1254         break;
   1255         }
   1256     }
   1257 
   1258     // Recursively parse args
   1259     template <typename... TArgsT>
   1260     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
   1261     {
   1262         switch(tArg)
   1263         {
   1264         case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
   1265         case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
   1266         case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
   1267         case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
   1268         case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
   1269         default:
   1270         SWR_ASSERT(0 && "Invalid sample count\n");
   1271         return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
   1272         break;
   1273         }
   1274     }
   1275 
   1276     // Recursively parse args
   1277     template <typename... TArgsT>
   1278     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
   1279     {
   1280         if(tArg == true)
   1281         {
   1282             return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
   1283         }
   1284 
   1285         return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
   1286     }
   1287 };
   1288 
   1289 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
   1290 {
   1291     for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
   1292     {
   1293         for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
   1294         {
   1295             for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
   1296             {
   1297                 table[inputCoverage][isCentroid][canEarlyZ] =
   1298                     BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
   1299                                          (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
   1300             }
   1301         }
   1302     }
   1303 }
   1304 
   1305 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
   1306 {
   1307     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
   1308     {
   1309         for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
   1310         {
   1311             for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
   1312             {
   1313                 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
   1314                 {
   1315                     for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
   1316                     {
   1317                         for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
   1318                         {
   1319                             table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
   1320                                 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
   1321                                                         (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
   1322                         }
   1323                     }
   1324                 }
   1325             }
   1326         }
   1327     }
   1328 }
   1329 
   1330 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
   1331 {
   1332     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
   1333     {
   1334         for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
   1335         {
   1336             for(uint32_t centroid = 0; centroid < 2; centroid++)
   1337             {
   1338                 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
   1339                 {
   1340                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
   1341                         BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
   1342                                              (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
   1343                 }
   1344             }
   1345         }
   1346     }
   1347 }
   1348 
   1349 void InitBackendFuncTables()
   1350 {
   1351     InitBackendSingleFuncTable(gBackendSingleSample);
   1352     InitBackendPixelFuncTable(gBackendPixelRateTable);
   1353     InitBackendSampleFuncTable(gBackendSampleRateTable);
   1354 
   1355     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
   1356     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
   1357     gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
   1358     gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
   1359     gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
   1360 }
   1361