Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.cpp
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 
     30 #include <smmintrin.h>
     31 
     32 #include "backend.h"
     33 #include "backend_impl.h"
     34 #include "tilemgr.h"
     35 #include "memory/tilingtraits.h"
     36 #include "core/multisample.h"
     37 #include "backends/gen_BackendPixelRate.hpp"
     38 
     39 #include <algorithm>
     40 
     41 
     42 //////////////////////////////////////////////////////////////////////////
     43 /// @brief Process compute work.
     44 /// @param pDC - pointer to draw context (dispatch).
     45 /// @param workerId - The unique worker ID that is assigned to this thread.
     46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
     47 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
     48 {
     49     SWR_CONTEXT *pContext = pDC->pContext;
     50 
     51     AR_BEGIN(BEDispatch, pDC->drawId);
     52 
     53     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
     54     SWR_ASSERT(pTaskData != nullptr);
     55 
     56     // Ensure spill fill memory has been allocated.
     57     size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
     58     if (spillFillSize && pSpillFillBuffer == nullptr)
     59     {
     60         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     61     }
     62 
     63     size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
     64     if (scratchSpaceSize && pScratchSpace == nullptr)
     65     {
     66         pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
     67     }
     68 
     69     const API_STATE& state = GetApiState(pDC);
     70 
     71     SWR_CS_CONTEXT csContext{ 0 };
     72     csContext.tileCounter = threadGroupId;
     73     csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
     74     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     75     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     76     csContext.pTGSM = pContext->ppScratch[workerId];
     77     csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
     78     csContext.pScratchSpace = (uint8_t*)pScratchSpace;
     79     csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
     80 
     81     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
     82 
     83     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
     84 
     85     AR_END(BEDispatch, 1);
     86 }
     87 
     88 //////////////////////////////////////////////////////////////////////////
     89 /// @brief Process shutdown.
     90 /// @param pDC - pointer to draw context (dispatch).
     91 /// @param workerId - The unique worker ID that is assigned to this thread.
     92 /// @param threadGroupId - the linear index for the thread group within the dispatch.
     93 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
     94 {
     95     // Dummy function
     96 }
     97 
     98 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
     99 {
    100     uint32_t x, y;
    101     MacroTileMgr::getTileIndices(macroTile, x, y);
    102     SWR_ASSERT(x == 0 && y == 0);
    103 }
    104 
    105 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
    106     SWR_RENDERTARGET_ATTACHMENT attachment)
    107 {
    108     SWR_CONTEXT *pContext = pDC->pContext;
    109 
    110     AR_BEGIN(BEStoreTiles, pDC->drawId);
    111 
    112     SWR_FORMAT srcFormat;
    113     switch (attachment)
    114     {
    115     case SWR_ATTACHMENT_COLOR0:
    116     case SWR_ATTACHMENT_COLOR1:
    117     case SWR_ATTACHMENT_COLOR2:
    118     case SWR_ATTACHMENT_COLOR3:
    119     case SWR_ATTACHMENT_COLOR4:
    120     case SWR_ATTACHMENT_COLOR5:
    121     case SWR_ATTACHMENT_COLOR6:
    122     case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    123     case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
    124     case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
    125     default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    126     }
    127 
    128     uint32_t x, y;
    129     MacroTileMgr::getTileIndices(macroTile, x, y);
    130 
    131     // Only need to store the hottile if it's been rendered to...
    132     HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    133     if (pHotTile)
    134     {
    135         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
    136         if (pHotTile->state == HOTTILE_CLEAR)
    137         {
    138             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
    139             SWR_ASSERT(pfnClearTiles != nullptr);
    140 
    141             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
    142         }
    143 
    144         if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
    145         {
    146             int32_t destX = KNOB_MACROTILE_X_DIM * x;
    147             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
    148 
    149             pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
    150                 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
    151         }
    152 
    153 
    154         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
    155         {
    156             if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
    157             {
    158                 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
    159             }
    160         }
    161     }
    162     AR_END(BEStoreTiles, 1);
    163 }
    164 
    165 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
    166 {
    167     STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
    168 
    169     unsigned long rt = 0;
    170     uint32_t mask = pDesc->attachmentMask;
    171     while (_BitScanForward(&rt, mask))
    172     {
    173         mask &= ~(1 << rt);
    174         ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
    175     }
    176 }
    177 
    178 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
    179 {
    180     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
    181     SWR_CONTEXT *pContext = pDC->pContext;
    182 
    183     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
    184 
    185     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
    186     {
    187         if (pDesc->attachmentMask & (1 << i))
    188         {
    189             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
    190                 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
    191             if (pHotTile)
    192             {
    193                 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
    194             }
    195         }
    196     }
    197 }
    198 
    199 template<uint32_t sampleCountT>
    200 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
    201 {
    202     SWR_CONTEXT *pContext = pDC->pContext;
    203 
    204     AR_BEGIN(BENullBackend, pDC->drawId);
    205     ///@todo: handle center multisample pattern
    206     AR_BEGIN(BESetup, pDC->drawId);
    207 
    208     const API_STATE &state = GetApiState(pDC);
    209 
    210     BarycentricCoeffs coeffs;
    211     SetupBarycentricCoeffs(&coeffs, work);
    212 
    213     uint8_t *pDepthBuffer, *pStencilBuffer;
    214     SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
    215 
    216     SWR_PS_CONTEXT psContext;
    217     // skip SetupPixelShaderContext(&psContext, ...); // not needed here
    218 
    219     AR_END(BESetup, 0);
    220 
    221     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    222 
    223     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    224     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    225     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    226     {
    227         simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
    228 
    229         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
    230 
    231         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
    232         {
    233             // iterate over active samples
    234             unsigned long sample = 0;
    235             uint32_t sampleMask = state.blendState.sampleMask;
    236             while (_BitScanForward(&sample, sampleMask))
    237             {
    238                 sampleMask &= ~(1 << sample);
    239 
    240                 simdmask coverageMask = work.coverageMask[sample] & MASK;
    241 
    242                 if (coverageMask)
    243                 {
    244                     // offset depth/stencil buffers current sample
    245                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
    246                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
    247 
    248                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    249                     {
    250                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    251 
    252                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
    253 
    254                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    255                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    256 
    257                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
    258                     }
    259 
    260                     AR_BEGIN(BEBarycentric, pDC->drawId);
    261 
    262                     // calculate per sample positions
    263                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
    264                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
    265 
    266                     CalcSampleBarycentrics(coeffs, psContext);
    267 
    268                     // interpolate and quantize z
    269                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
    270                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    271 
    272                     AR_END(BEBarycentric, 0);
    273 
    274                     // interpolate user clip distance if available
    275                     if (state.backendState.clipDistanceMask)
    276                     {
    277                         coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
    278                     }
    279 
    280                     simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
    281                     simdscalar stencilPassMask = vCoverageMask;
    282 
    283                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
    284                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    285                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
    286                     AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
    287                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    288                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    289                     AR_END(BEEarlyDepthTest, 0);
    290 
    291                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
    292                     uint32_t statCount = _mm_popcnt_u32(statMask);
    293                     UPDATE_STAT_BE(DepthPassCount, statCount);
    294                 }
    295 
    296             Endtile:
    297                 ATTR_UNUSED;
    298                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    299             }
    300 
    301             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
    302             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
    303 
    304             vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
    305         }
    306 
    307         vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    308     }
    309 
    310     AR_END(BENullBackend, 0);
    311 }
    312 
    313 PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
    314 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
    315 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
    316                                      [2] // centroid
    317                                      [2] // canEarlyZ
    318                                      = {};
    319 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
    320                                        [2] // isCenterPattern
    321                                        [SWR_INPUT_COVERAGE_COUNT]
    322                                        [2] // centroid
    323                                        [2] // forcedSampleCount
    324                                        [2] // canEarlyZ
    325                                        = {};
    326 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
    327                                         [SWR_INPUT_COVERAGE_COUNT]
    328                                         [2] // centroid
    329                                         [2] // canEarlyZ
    330                                         = {};
    331 
    332 void InitBackendFuncTables()
    333 {
    334     InitBackendPixelRate();
    335     InitBackendSingleFuncTable(gBackendSingleSample);
    336     InitBackendSampleFuncTable(gBackendSampleRateTable);
    337 
    338     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
    339     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
    340     gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
    341     gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
    342     gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
    343 }
    344