Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.cpp
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 
     30 #include <smmintrin.h>
     31 
     32 #include "backend.h"
     33 #include "backend_impl.h"
     34 #include "tilemgr.h"
     35 #include "memory/tilingtraits.h"
     36 #include "core/multisample.h"
     37 
     38 #include <algorithm>
     39 
     40 template<typename T>
     41 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
     42 {
     43     SWR_CONTEXT *pContext = pDC->pContext;
     44 
     45     AR_BEGIN(BESampleRateBackend, pDC->drawId);
     46     AR_BEGIN(BESetup, pDC->drawId);
     47 
     48     const API_STATE &state = GetApiState(pDC);
     49 
     50     BarycentricCoeffs coeffs;
     51     SetupBarycentricCoeffs(&coeffs, work);
     52 
     53     SWR_PS_CONTEXT psContext;
     54     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     55     SetupPixelShaderContext<T>(&psContext, samplePos, work);
     56 
     57     uint8_t *pDepthBuffer, *pStencilBuffer;
     58     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
     59 
     60     AR_END(BESetup, 0);
     61 
     62     psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     63     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     64 
     65     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
     66 
     67     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     68     {
     69         psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
     70         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
     71 
     72         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
     73 
     74         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
     75         {
     76 #if USE_8x2_TILE_BACKEND
     77             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
     78 #endif
     79             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
     80             {
     81                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
     82 
     83                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
     84             }
     85 
     86             AR_BEGIN(BEBarycentric, pDC->drawId);
     87 
     88             CalcPixelBarycentrics(coeffs, psContext);
     89 
     90             CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
     91 
     92             AR_END(BEBarycentric, 0);
     93 
     94             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
     95             {
     96                 simdmask coverageMask = work.coverageMask[sample] & MASK;
     97 
     98                 if (coverageMask)
     99                 {
    100                     // offset depth/stencil buffers current sample
    101                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
    102                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
    103 
    104                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    105                     {
    106                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    107 
    108                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
    109 
    110                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    111                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    112 
    113                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
    114                     }
    115 
    116                     AR_BEGIN(BEBarycentric, pDC->drawId);
    117 
    118                     // calculate per sample positions
    119                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
    120                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
    121 
    122                     CalcSampleBarycentrics(coeffs, psContext);
    123 
    124                     // interpolate and quantize z
    125                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
    126                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    127 
    128                     AR_END(BEBarycentric, 0);
    129 
    130                     // interpolate user clip distance if available
    131                     if (state.backendState.clipDistanceMask)
    132                     {
    133                         coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
    134                     }
    135 
    136                     simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
    137                     simdscalar depthPassMask = vCoverageMask;
    138                     simdscalar stencilPassMask = vCoverageMask;
    139 
    140                     // Early-Z?
    141                     if (T::bCanEarlyZ)
    142                     {
    143                         AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
    144                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    145                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
    146                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
    147                         AR_END(BEEarlyDepthTest, 0);
    148 
    149                         // early-exit if no samples passed depth or earlyZ is forced on.
    150                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
    151                         {
    152                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    153                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    154 
    155                             if (!_simd_movemask_ps(depthPassMask))
    156                             {
    157                                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    158                                 continue;
    159                             }
    160                         }
    161                     }
    162 
    163                     psContext.sampleIndex = sample;
    164                     psContext.activeMask = _simd_castps_si(vCoverageMask);
    165 
    166                     // execute pixel shader
    167                     AR_BEGIN(BEPixelShader, pDC->drawId);
    168                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
    169                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    170                     AR_END(BEPixelShader, 0);
    171 
    172                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
    173 
    174                     // late-Z
    175                     if (!T::bCanEarlyZ)
    176                     {
    177                         AR_BEGIN(BELateDepthTest, pDC->drawId);
    178                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    179                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
    180                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
    181                         AR_END(BELateDepthTest, 0);
    182 
    183                         if (!_simd_movemask_ps(depthPassMask))
    184                         {
    185                             // need to call depth/stencil write for stencil write
    186                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    187                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    188 
    189                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    190                             continue;
    191                         }
    192                     }
    193 
    194                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
    195                     uint32_t statCount = _mm_popcnt_u32(statMask);
    196                     UPDATE_STAT_BE(DepthPassCount, statCount);
    197 
    198                     // output merger
    199                     AR_BEGIN(BEOutputMerger, pDC->drawId);
    200 #if USE_8x2_TILE_BACKEND
    201                     OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
    202 #else
    203                     OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
    204 #endif
    205 
    206                     // do final depth write after all pixel kills
    207                     if (!state.psState.forceEarlyZ)
    208                     {
    209                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
    210                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
    211                     }
    212                     AR_END(BEOutputMerger, 0);
    213                 }
    214                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    215             }
    216 
    217         Endtile:
    218             ATTR_UNUSED;
    219 
    220             AR_BEGIN(BEEndTile, pDC->drawId);
    221 
    222             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
    223             {
    224                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
    225             }
    226 
    227 #if USE_8x2_TILE_BACKEND
    228             if (useAlternateOffset)
    229             {
    230                 DWORD rt;
    231                 uint32_t rtMask = state.colorHottileEnable;
    232                 while (_BitScanForward(&rt, rtMask))
    233                 {
    234                     rtMask &= ~(1 << rt);
    235                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    236                 }
    237             }
    238 #else
    239             DWORD rt;
    240             uint32_t rtMask = state.colorHottileEnable;
    241             while (_BitScanForward(&rt, rtMask))
    242             {
    243                 rtMask &= ~(1 << rt);
    244                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
    245             }
    246 #endif
    247             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
    248             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
    249 
    250             AR_END(BEEndTile, 0);
    251 
    252             psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
    253             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
    254         }
    255 
    256         psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
    257         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    258     }
    259 
    260     AR_END(BESampleRateBackend, 0);
    261 }
    262 
    263 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
    264 // arguments to static template arguments.
    265 template <uint32_t... ArgsT>
    266 struct BEChooserSampleRate
    267 {
    268     // Last Arg Terminator
    269     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
    270     {
    271         switch (tArg)
    272         {
    273         case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
    274         case SWR_BACKEND_SINGLE_SAMPLE:
    275         case SWR_BACKEND_MSAA_PIXEL_RATE:
    276             SWR_ASSERT(0 && "Invalid backend func\n");
    277             return nullptr;
    278             break;
    279         default:
    280             SWR_ASSERT(0 && "Invalid backend func\n");
    281             return nullptr;
    282             break;
    283         }
    284     }
    285 
    286     // Recursively parse args
    287     template <typename... TArgsT>
    288     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
    289     {
    290         switch (tArg)
    291         {
    292         case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
    293         case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
    294         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
    295         default:
    296             SWR_ASSERT(0 && "Invalid sample pattern\n");
    297             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
    298             break;
    299         }
    300     }
    301 
    302     // Recursively parse args
    303     template <typename... TArgsT>
    304     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
    305     {
    306         switch (tArg)
    307         {
    308         case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
    309         case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
    310         case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
    311         case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
    312         case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
    313         default:
    314             SWR_ASSERT(0 && "Invalid sample count\n");
    315             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
    316             break;
    317         }
    318     }
    319 
    320     // Recursively parse args
    321     template <typename... TArgsT>
    322     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
    323     {
    324         if (tArg == true)
    325         {
    326             return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
    327         }
    328 
    329         return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
    330     }
    331 };
    332 
    333 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
    334 {
    335     for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
    336     {
    337         for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
    338         {
    339             for (uint32_t centroid = 0; centroid < 2; centroid++)
    340             {
    341                 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
    342                 {
    343                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
    344                         BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
    345                         (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
    346                 }
    347             }
    348         }
    349     }
    350 }
    351