Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.cpp
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 
     30 #include <smmintrin.h>
     31 
     32 #include "backend.h"
     33 #include "backend_impl.h"
     34 #include "tilemgr.h"
     35 #include "memory/tilingtraits.h"
     36 #include "core/multisample.h"
     37 
     38 #include <algorithm>
     39 
     // BackendSingleSample: single-sample pixel backend for one tile of one triangle.
     //
     // Walks the KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM region whose origin is (x, y) in
     // steps of SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM. For every step whose coverage is
     // non-zero it performs, in order: optional depth-bounds test, optional input
     // coverage generation, barycentric / centroid / Z interpolation, optional user
     // clip culling, early depth-stencil test (when T::bCanEarlyZ), the pixel shader,
     // late depth-stencil test (when !T::bCanEarlyZ), the output merger and the
     // depth/stencil write.
     //
     // Template parameter:
     //   T             - traits type; this function reads T::InputCoverage and
     //                   T::bCanEarlyZ and forwards T to SetupPixelShaderContext,
     //                   generateInputCoverage and CalcCentroid.
     // Parameters:
     //   pDC           - draw context; source of the API state, draw id and the
     //                   private state handed to the pixel shader.
     //   workerId      - not referenced by name in this body; presumably consumed by
     //                   the stat/profiling macros - TODO confirm.
     //   x, y          - origin of the tile, converted to float to seed the per-lane
     //                   pixel positions.
     //   work          - triangle descriptor. NOTE: it is modified - coverageMask[0]
     //                   (and innerCoverageMask for inner-conservative coverage) are
     //                   shifted right as each SIMD tile is consumed.
     //   renderBuffers - passed to SetupRenderBuffers to obtain the color, depth and
     //                   stencil pointers that are advanced while walking the tile.
     40 template<typename T>
     41 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
     42 {
            // Not referenced by name below; presumably used by the AR_* / stat macros - TODO confirm.
     43     SWR_CONTEXT *pContext = pDC->pContext;
     44 
     45     AR_BEGIN(BESingleSampleBackend, pDC->drawId);
     46     AR_BEGIN(BESetup, pDC->drawId);
     47 
     48     const API_STATE &state = GetApiState(pDC);
     49 
     50     BarycentricCoeffs coeffs;
     51     SetupBarycentricCoeffs(&coeffs, work);
     52 
     53     SWR_PS_CONTEXT psContext;
     54     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     55     SetupPixelShaderContext<T>(&psContext, samplePos, work);
     56 
     57     uint8_t *pDepthBuffer, *pStencilBuffer;
     58     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
     59 
     60     AR_END(BESetup, 1);
     61 
            // Seed the per-lane Y positions at the tile origin; they are advanced by dy
            // at the bottom of the outer loop.
     62     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
     63     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     64 
     65     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
     66 
     67     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     68     {
                // Restart the per-lane X positions at the left edge for every row of SIMD tiles.
     69         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
     70         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
     71 
     72         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
     73 
     74         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
     75         {
     76 #if USE_8x2_TILE_BACKEND
                    // True on every other SIMD tile along X; selects the alternate offset in
                    // OutputMerger8x2 and gates the color pointer advance at Endtile.
     77             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
     78 #endif
                    // The low bits of coverageMask[0] describe the current SIMD tile: the mask is
                    // shifted at Endtile after every iteration. MASK is defined outside this function.
     79             simdmask coverageMask = work.coverageMask[0] & MASK;
     80 
     81             if (coverageMask)
     82             {
                        // Optional depth-bounds test: the values currently stored in the depth
                        // buffer are checked against [minz, maxz] and the resulting accept mask
                        // is ANDed into the coverage.
     83                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
     84                 {
     85                     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
     86 
     87                     const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
     88 
     89                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
     90                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
     91 
     92                     coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
     93                 }
     94 
                        // T::InputCoverage is a compile-time constant (it is also used as a template
                        // argument below), so this branch is fixed per instantiation.
     95                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
     96                 {
     97                     const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
     98 
     99                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
    100                 }
    101 
    102                 AR_BEGIN(BEBarycentric, pDC->drawId);
    103 
    104                 CalcPixelBarycentrics(coeffs, psContext);
    105 
    106                 CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
    107 
    108                 // interpolate and quantize z
    109                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
    110                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    111 
    112                 AR_END(BEBarycentric, 1);
    113 
    114                 // interpolate user clip distance if available
    115                 if (state.backendState.clipDistanceMask)
    116                 {
    117                     coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
    118                 }
    119 
                        // Expand the bit mask into a per-lane vector mask; the depth and stencil
                        // pass masks start out equal to the coverage.
    120                 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
    121                 simdscalar depthPassMask = vCoverageMask;
    122                 simdscalar stencilPassMask = vCoverageMask;
    123 
    124                 // Early-Z?
    125                 if (T::bCanEarlyZ)
    126                 {
    127                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
    128                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    129                                                      psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
    130                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
    131                     AR_END(BEEarlyDepthTest, 0);
    132 
    133                     // early-exit if no pixels passed depth or earlyZ is forced on
    134                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
    135                     {
                                // Written here when forceEarlyZ is set (the final write near the end
                                // of the tile is then skipped) or when no lane passed depth, in which
                                // case the tile is abandoned right after the write.
    136                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    137                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    138 
    139                         if (!_simd_movemask_ps(depthPassMask))
    140                         {
    141                             goto Endtile;
    142                         }
    143                     }
    144                 }
    145 
    146                 psContext.sampleIndex = 0;
    147                 psContext.activeMask = _simd_castps_si(vCoverageMask);
    148 
    149                 // execute pixel shader
    150                 AR_BEGIN(BEPixelShader, pDC->drawId);
    151                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
    152                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    153                 AR_END(BEPixelShader, 0);
    154 
                        // Read the active mask back after the shader ran; the early-Z branch below
                        // treats lanes cleared here as shader discards.
    155                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
    156 
    157                 // late-Z
    158                 if (!T::bCanEarlyZ)
    159                 {
    160                     AR_BEGIN(BELateDepthTest, pDC->drawId);
    161                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    162                                                         psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
    163                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
    164                     AR_END(BELateDepthTest, 0);
    165 
    166                     if (!_simd_movemask_ps(depthPassMask))
    167                     {
    168                         // need to call depth/stencil write for stencil write
    169                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    170                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    171                         goto Endtile;
    172                     }
    173                 } else {
    174                     // for early z, consolidate discards from shader
    175                     // into depthPassMask
    176                     depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
    177                 }
    178 
                        // Statistics: count the lanes that passed the depth test.
    179                 uint32_t statMask = _simd_movemask_ps(depthPassMask);
    180                 uint32_t statCount = _mm_popcnt_u32(statMask);
    181                 UPDATE_STAT_BE(DepthPassCount, statCount);
    182 
    183                 // output merger
    184                 AR_BEGIN(BEOutputMerger, pDC->drawId);
    185 #if USE_8x2_TILE_BACKEND
    186                 OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
    187 #else
    188                 OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
    189 #endif
    190 
    191                 // do final depth write after all pixel kills
                        // (skipped when forceEarlyZ is set: the write already happened right after
                        // the early depth test above)
    192                 if (!state.psState.forceEarlyZ)
    193                 {
    194                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    195                         pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
    196                 }
    197                 AR_END(BEOutputMerger, 0);
    198             }
    199 
                    // Reached both by falling out of the coverage branch and through the early-out
                    // gotos above; everything below must run once per SIMD tile regardless of coverage.
    200 Endtile:
    201             AR_BEGIN(BEEndTile, pDC->drawId);
    202 
                    // Consume this SIMD tile's lanes so the next tile's bits sit in the low bits.
    203             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    204             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
    205             {
    206                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    207             }
    208 
    209 #if USE_8x2_TILE_BACKEND
                    // In the 8x2 configuration the color pointers advance only on every second
                    // SIMD tile, and then by two SIMD-widths of pixels at once.
    210             if (useAlternateOffset)
    211             {
    212                 DWORD rt;
    213                 uint32_t rtMask = state.colorHottileEnable;
    214                 while(_BitScanForward(&rt, rtMask))
    215                 {
    216                     rtMask &= ~(1 << rt);
    217                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    218                 }
    219             }
    220 #else
                    // Advance the color pointer of every render target enabled in
                    // colorHottileEnable by one SIMD-width of pixels.
    221             DWORD rt;
    222             uint32_t rtMask = state.colorHottileEnable;
    223             while (_BitScanForward(&rt, rtMask))
    224             {
    225                 rtMask &= ~(1 << rt);
    226                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    227             }
    228 #endif
                    // Depth and stencil pointers advance by one SIMD-width of elements per SIMD tile
                    // (bpp is divided by 8 to get bytes).
    229             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
    230             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
    231 
    232             AR_END(BEEndTile, 0);
    233 
                    // Step the per-lane X positions to the next SIMD tile in this row.
    234             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
    235             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
    236         }
    237 
                // Step the per-lane Y positions to the next row of SIMD tiles.
    238         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
    239         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    240     }
    241 
    242     AR_END(BESingleSampleBackend, 0);
    243 }
    244 
    245 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
    246 // arguments to static template arguments.
    247 template <uint32_t... ArgsT>
    248 struct BEChooserSingleSample
    249 {
    250     // Last Arg Terminator
    251     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
    252     {
    253         switch(tArg)
    254         {
    255         case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
    256         case SWR_BACKEND_MSAA_PIXEL_RATE:
    257         case SWR_BACKEND_MSAA_SAMPLE_RATE:
    258         default:
    259             SWR_ASSERT(0 && "Invalid backend func\n");
    260             return nullptr;
    261             break;
    262         }
    263     }
    264 
    265     // Recursively parse args
    266     template <typename... TArgsT>
    267     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
    268     {
    269         switch(tArg)
    270         {
    271         case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
    272         case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
    273         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
    274         default:
    275         SWR_ASSERT(0 && "Invalid sample pattern\n");
    276         return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
    277         break;
    278         }
    279     }
    280 
    281     // Recursively parse args
    282     template <typename... TArgsT>
    283     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
    284     {
    285         switch(tArg)
    286         {
    287         case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
    288         case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
    289         case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
    290         case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
    291         case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
    292         default:
    293         SWR_ASSERT(0 && "Invalid sample count\n");
    294         return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
    295         break;
    296         }
    297     }
    298 
    299     // Recursively parse args
    300     template <typename... TArgsT>
    301     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
    302     {
    303         if(tArg == true)
    304         {
    305             return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
    306         }
    307 
    308         return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
    309     }
    310 };
    311 
    312 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
    313 {
    314     for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
    315     {
    316         for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
    317         {
    318             for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
    319             {
    320                 table[inputCoverage][isCentroid][canEarlyZ] =
    321                     BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
    322                                          (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
    323             }
    324         }
    325     }
    326 }
    327