Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.h
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 #pragma once
     30 
     31 #include "common/os.h"
     32 #include "core/context.h"
     33 #include "core/multisample.h"
     34 #include "rdtsc_core.h"
     35 
// Backend entry points and one-time table initialisers. Only the declarations
// live in this header; the definitions are not visible here.
// NOTE(review): the Process*BE functions share a (pDC, workerId, macroTile, pData)
// shape and presumably serve as per-macrotile work callbacks -- confirm against
// the dispatch code.
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
void InitClearTilesTable();
simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
void InitBackendFuncTables();
void InitCPSFuncTables();
void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
     48 
// Identifies a backend variant by sampling mode.
// NOTE(review): presumably used as an index into the tables filled in by
// InitBackendFuncTables() -- confirm at the use sites.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE,
    SWR_BACKEND_MSAA_PIXEL_RATE,
    SWR_BACKEND_MSAA_SAMPLE_RATE,
    SWR_BACKEND_FUNCS_MAX,          // number of variants above; keep as the last enumerator
};
     56 
// Declarations that only exist for the 8-wide SIMD configuration. The offset
// vectors are defined in another translation unit.
#if KNOB_SIMD_WIDTH == 8
extern const __m256 vCenterOffsetsX;
extern const __m256 vCenterOffsetsY;
extern const __m256 vULOffsetsX;
extern const __m256 vULOffsetsY;
// Keeps the low 8 bits of a coverage mask (one bit per SIMD lane); used that way below in this header.
#define MASK 0xff
#endif
     64 
     65 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
     66 {
     67     static const uint32_t RasterTileColorOffsets[16]
     68     { 0,
     69       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
     70       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
     71       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
     72       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
     73       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
     74       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
     75       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
     76       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
     77       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
     78       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
     79       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
     80       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
     81       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
     82       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
     83       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
     84     };
     85     assert(sampleNum < 16);
     86     return RasterTileColorOffsets[sampleNum];
     87 }
     88 
     89 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
     90 {
     91     static const uint32_t RasterTileDepthOffsets[16]
     92     { 0,
     93       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
     94       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
     95       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
     96       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
     97       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
     98       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
     99       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
    100       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
    101       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
    102       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
    103       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
    104       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
    105       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
    106       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
    107       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
    108     };
    109     assert(sampleNum < 16);
    110     return RasterTileDepthOffsets[sampleNum];
    111 }
    112 
    113 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
    114 {
    115     static const uint32_t RasterTileStencilOffsets[16]
    116     { 0,
    117       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
    118       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
    119       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
    120       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
    121       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
    122       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
    123       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
    124       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
    125       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
    126       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
    127       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
    128       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
    129       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
    130       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
    131       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
    132     };
    133     assert(sampleNum < 16);
    134     return RasterTileStencilOffsets[sampleNum];
    135 }
    136 
//////////////////////////////////////////////////////////////////////////
/// @brief Builds per-pixel input coverage for one 8-wide SIMD span.
///        The constructors do all the work; a temporary of this type is
///        created purely for its side effect on the output argument.
///        For every pixel i, bit s of the result is bit i of the low byte of
///        the coverage mask used for sample s.
/// @tparam T - backend traits; reads T::bIsStandardPattern,
///         T::MultisampleT::numSamples, T::bForcedSampleCount and T::InputCoverage.
/// @tparam InputCoverage - not referenced by this primary template; it only
///         selects between the primary template and its specialization.
template<typename T, uint32_t InputCoverage>
struct generateInputCoverage
{
    /// @param coverageMask - array of 64-bit coverage masks indexed by sample.
    /// @param inputMask - out: one sample bitmask per pixel of the span.
    /// @param sampleMask - ANDed into every result unless T::bForcedSampleCount is set.
    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
    {
        // will need to update for avx512
        assert(KNOB_SIMD_WIDTH == 8);

        __m256i mask[2];
        __m256i sampleCoverage[2];
        if(T::bIsStandardPattern)
        {
            __m256i src = _mm256_set1_epi32(0);
            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;

            // enable one gather lane per sample; lanes beyond numSamples keep the zero 'src' value
            if(T::MultisampleT::numSamples == 1)
            {
                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
            }
            else if(T::MultisampleT::numSamples == 2)
            {
                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
            }
            else if(T::MultisampleT::numSamples == 4)
            {
                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
            }
            else if(T::MultisampleT::numSamples == 8)
            {
                mask[0] = _mm256_set1_epi32(-1);
            }
            else if(T::MultisampleT::numSamples == 16)
            {
                // mask[1] and index1 are only initialised (and only consumed) on this 16x path
                mask[0] = _mm256_set1_epi32(-1);
                mask[1] = _mm256_set1_epi32(-1);
                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
            }

            // gather coverage for samples 0-7
            // (scale of 8 bytes: one 32-bit read from the start of each 64-bit coverageMask entry)
            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
            if(T::MultisampleT::numSamples > 8)
            {
                // gather coverage for samples 8-15
                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
            }
        }
        else
        {
            // center coverage is the same for all samples; just broadcast to the sample slots
            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
            if(T::MultisampleT::numSamples == 1)
            {
                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
            }
            else if(T::MultisampleT::numSamples == 2)
            {
                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
            }
            else if(T::MultisampleT::numSamples == 4)
            {
                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
            }
            else if(T::MultisampleT::numSamples == 8)
            {
                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
            }
            else if(T::MultisampleT::numSamples == 16)
            {
                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
            }
        }

        // mask[0] is reused from here on as a byte-shuffle control: it selects byte 0 of each
        // 32-bit element (0x0, 0x4, 0x8, 0xC) and zeroes the rest (-1 entries)
        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
        __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);

        __m256i packedCoverage1;
        if(T::MultisampleT::numSamples > 8)
        {
            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
        }

        // Two implementations of the same packing step follow: one built from float-domain
        // permute/shuffle/blend for KNOB_ARCH_AVX, one using the 32-bit integer permute otherwise.
    #if (KNOB_ARCH == KNOB_ARCH_AVX)
        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
        __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
        __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));

        __m256i packedSampleCoverage;
        if(T::MultisampleT::numSamples > 8)
        {
            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
        }
        else
        {
            packedSampleCoverage = packedCoverage0;
        }
    #else
        __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);

        __m256i packedSampleCoverage;
        if(T::MultisampleT::numSamples > 8)
        {
            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);

            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
        }
        else
        {
            packedSampleCoverage = packedCoverage0;
        }
    #endif

        // Byte s of packedSampleCoverage now holds sample s's 8-bit pixel coverage. movemask takes
        // the top bit of every byte, so the first pass yields pixel 7; each left shift by one then
        // moves the next lower pixel's bit into the top position, hence the descending loop.
        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
        {
            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);

            if(!T::bForcedSampleCount)
            {
                // input coverage has to be anded with sample mask if MSAA isn't forced on
                inputMask[i] &= sampleMask;
            }

            // shift to the next pixel in the 4x2
            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
        }
    }

    /// @brief Same computation as the array overload, with the eight per-pixel masks
    ///        returned as the 32-bit lanes of a SIMD register (pixel 0 in lane 0).
    /// @param inputCoverage - out: per-pixel sample masks, integer bits reinterpreted as float lanes.
    INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
    {
        uint32_t inputMask[KNOB_SIMD_WIDTH];
        // constructs a temporary for its side effect on inputMask; dispatches on T::InputCoverage
        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
    }

};
    287 
    288 template<typename T>
    289 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
    290 {
    291     INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
    292     {
    293         // will need to update for avx512
    294         assert(KNOB_SIMD_WIDTH == 8);
    295         __m256i vec = _mm256_set1_epi32(coverageMask[0]);
    296         const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    297         vec = _simd_and_si(vec, bit);
    298         vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
    299         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
    300         inputCoverage = _simd_castsi_ps(vec);
    301     }
    302 
    303     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
    304     {
    305         uint32_t simdCoverage = (coverageMask[0] & MASK);
    306         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
    307         for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
    308         {
    309             // set all samples to covered if conservative coverage mask is set for that pixel
    310             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
    311         }
    312     }
    313 };
    314 
    315 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    316 // Centroid behaves exactly as follows :
    317 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
    318 //     have a sample location there).
    319 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
    320 //     coverage with the SampleMask Rasterizer State.
    321 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
    322 //     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
    323 //     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
    324 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    325 template<typename T>
    326 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
    327                             const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
    328 {
    329     uint32_t inputMask[KNOB_SIMD_WIDTH];
    330     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
    331 
    332     // Case (2) - partially covered pixel
    333 
    334     // scan for first covered sample per pixel in the 4x2 span
    335     unsigned long sampleNum[KNOB_SIMD_WIDTH];
    336     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
    337     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
    338     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
    339     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
    340     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
    341     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
    342     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
    343     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
    344 
    345     // look up and set the sample offsets from UL pixel corner for first covered sample
    346     __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
    347                                     T::MultisampleT::X(sampleNum[6]),
    348                                     T::MultisampleT::X(sampleNum[5]),
    349                                     T::MultisampleT::X(sampleNum[4]),
    350                                     T::MultisampleT::X(sampleNum[3]),
    351                                     T::MultisampleT::X(sampleNum[2]),
    352                                     T::MultisampleT::X(sampleNum[1]),
    353                                     T::MultisampleT::X(sampleNum[0]));
    354 
    355     __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
    356                                     T::MultisampleT::Y(sampleNum[6]),
    357                                     T::MultisampleT::Y(sampleNum[5]),
    358                                     T::MultisampleT::Y(sampleNum[4]),
    359                                     T::MultisampleT::Y(sampleNum[3]),
    360                                     T::MultisampleT::Y(sampleNum[2]),
    361                                     T::MultisampleT::Y(sampleNum[1]),
    362                                     T::MultisampleT::Y(sampleNum[0]));
    363     // add sample offset to UL pixel corner
    364     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
    365     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
    366 
    367     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
    368     static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
    369     __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
    370     __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
    371 
    372     static const __m256i vZero = _simd_setzero_si();
    373     const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
    374     __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
    375     __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
    376     __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
    377 
    378     __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
    379 
    380     // set the centroid position based on results from above
    381     psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
    382     psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
    383 
    384     // Case (3a) No samples covered and partial sample mask
    385     __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
    386     // sample mask should never be all 0's for this case, but handle it anyways
    387     unsigned long firstCoveredSampleMaskSample = 0;
    388     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
    389 
    390     __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
    391 
    392     vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
    393     vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
    394 
    395     // blend in case 3a pixel locations
    396     psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
    397     psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
    398 }
    399 
    400 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
    401                                      const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
    402 {
    403     // evaluate I,J
    404     psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
    405     psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
    406     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
    407     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
    408 
    409     // interpolate 1/w
    410     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
    411 }
    412 
    413 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
    414 {
    415     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
    416     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
    417 
    418     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
    419 }
    420 
    421 template<typename T>
    422 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
    423 {
    424     // RT has to be single sample if we're in forcedMSAA mode
    425     if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
    426     {
    427         return 1;
    428     }
    429     // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
    430     else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
    431     {
    432         return GetNumSamples(blendSampleCount);
    433     }
    434     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
    435     else
    436     {
    437         return T::MultisampleT::numSamples;
    438     }
    439 }
    440 
    441 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
    442 {
    443     // broadcast scalars
    444 
    445     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
    446     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
    447     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
    448 
    449     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
    450     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
    451     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
    452 
    453     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
    454     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
    455     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
    456 
    457     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
    458 
    459     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
    460     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
    461     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
    462 }
    463 
    464 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
    465 {
    466     assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
    467 
    468     if (pColorBuffer)
    469     {
    470         for (uint32_t index = 0; index < colorBufferCount; index += 1)
    471         {
    472             pColorBuffer[index] = renderBuffers.pColor[index];
    473         }
    474     }
    475 
    476     if (pDepthBuffer)
    477     {
    478         *pDepthBuffer = renderBuffers.pDepth;
    479     }
    480 
    481     if (pStencilBuffer)
    482     {
    483         *pStencilBuffer = renderBuffers.pStencil;;
    484     }
    485 }
    486 
    487 template<typename T>
    488 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work)
    489 {
    490     psContext->pAttribs = work.pAttribs;
    491     psContext->pPerspAttribs = work.pPerspAttribs;
    492     psContext->frontFace = work.triFlags.frontFacing;
    493     psContext->primID = work.triFlags.primID;
    494 
    495     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
    496     psContext->I = work.I;
    497     psContext->J = work.J;
    498 
    499     psContext->recipDet = work.recipDet;
    500     psContext->pRecipW = work.pRecipW;
    501     psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
    502     psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
    503     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
    504     psContext->sampleIndex = 0;
    505 }
    506 
    /// @brief Writes the centroid members of the pixel shader context.
    ///        - IsSingleSample: centroid x, y, i, j and 1/w are copied from the center values.
    ///        - otherwise, if T::bCentroidPos: for the standard sample pattern CalcCentroidPos<T>()
    ///          is called (presumably it fills in centroid x/y - its definition is not in this
    ///          section); for any other pattern centroid x/y are set to UL + 0.5.
    ///          CalcCentroidBarycentrics() is then called in both cases.
    ///        - otherwise: only centroid x/y are written, copied from the per-sample x/y.
    /// @tparam T - backend traits; bCentroidPos and bIsStandardPattern are read here.
    /// @tparam IsSingleSample - selects the center-copy path.
    /// @param psContext - pixel shader context that is read and updated.
    /// @param coeffs - barycentric coefficients, forwarded to CalcCentroidBarycentrics().
    /// @param coverageMask - forwarded to CalcCentroidPos<T>().
    /// @param sampleMask - forwarded to CalcCentroidPos<T>().
    507 template<typename T, bool IsSingleSample>
    508 void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
    509 {
    510     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
    511     {
    512         // for 1x case, centroid is pixel center
    513         psContext->vX.centroid = psContext->vX.center;
    514         psContext->vY.centroid = psContext->vY.center;
    515         psContext->vI.centroid = psContext->vI.center;
    516         psContext->vJ.centroid = psContext->vJ.center;
    517         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
    518     }
    519     else
    520     {
    521         if (T::bCentroidPos)
    522         {
    523             ///@todo: don't need to generate input coverage 2x if input coverage and centroid
    524             if (T::bIsStandardPattern)
    525             {
    526                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
    527                 CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
    528             }
    529             else
    530             {
    531                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); // non-standard pattern: centroid = UL + 0.5
    532                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
    533             }
    534 
    535             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
    536         }
    537         else
    538         {
    539             psContext->vX.centroid = psContext->vX.sample; // vI/vJ/vOneOverW centroid are not written on this path
    540             psContext->vY.centroid = psContext->vY.sample;
    541         }
    542     }
    543 }
    544 
    /// @brief Depth/stencil test functor that walks every coverage sample of T::MultisampleT.
    ///        The constructor only stores its arguments; the triangle, coefficients, state and the
    ///        depth/stencil buffer pointers are held by reference, not copied.
    ///        operator() loops over T::MultisampleT::numCoverageSamples samples and, for each
    ///        sample that still has covered lanes:
    ///          1. optionally masks coverage with the depth bounds test,
    ///          2. computes the sample position and calls CalcSampleBarycentrics(),
    ///          3. takes Z from psContext.vZ (psState.writesODepth) or interpolates and quantizes it,
    ///          4. optionally masks coverage with the user clip distance mask,
    ///          5. runs DepthStencilTest(), and calls DepthStencilWrite() immediately when
    ///             psState.forceEarlyZ is set or no lane passed.
    ///        Per-sample results stay in the public vZ / vCoverageMask / depthPassMask /
    ///        stencilPassMask arrays.
    ///        operator() parameters:
    ///          activeLanes      - in/out; on return it is ANDed with the union of all per-sample
    ///                             depth pass masks.
    ///          psContext        - the .sample members of vX/vY/vI/vJ/vOneOverW are overwritten for
    ///                             every processed sample.
    ///          BEDepthBucket    - only referenced by the commented-out AR_BEGIN in the loop.
    ///          currentSimdIn8x8 - byte index into each sample's coverage mask.
    ///        operator() returns the number of lanes that passed the depth test, summed over samples.
    545 template<typename T>
    546 struct PixelRateZTestLoop
    547 {
    548     PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
    549                        uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
    550                        pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
    551                        clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {};
    552 
    553     INLINE
    554     uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
    555                         const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
    556     {
    557         SWR_CONTEXT *pContext = pDC->pContext; // not referenced by name below; presumably consumed by the AR_BEGIN/AR_END macros - TODO confirm
    558 
    559         uint32_t statCount = 0;
    560         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
    561         for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
    562         {
    563             const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; // view this sample's coverage mask as bytes
    564             vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); // selected byte, limited to the active lanes
    565 
    566             if(!_simd_movemask_ps(vCoverageMask[sample]))
    567             {
    568                 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); // nothing covered: clear all masks for this sample
    569                 continue;
    570             }
    571 
    572             // offset depth/stencil buffers current sample
    573             uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
    574             uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
    575 
    576             if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    577             {
    578                 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    579 
    580                 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); // depth currently stored for this sample
    581 
    582                 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    583                 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    584 
    585                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
    586             }
    587 
    588             AR_BEGIN(BEBarycentric, pDC->drawId);
    589 
    590             // calculate per sample positions
    591             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
    592             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
    593 
    594             // calc I & J per sample
    595             CalcSampleBarycentrics(coeffs, psContext);
    596 
    597             if(psState.writesODepth)
    598             {
    599                 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
    600                 vZ[sample] = psContext.vZ;
    601             }
    602             else
    603             {
    604                 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
    605                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
    606             }
    607 
    608             AR_END(BEBarycentric, 0);
    609 
    610             ///@todo: perspective correct vs non-perspective correct clipping?
    611             // if clip distances are enabled, we need to interpolate for each sample
    612             if(clipDistanceMask)
    613             {
    614                 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
    615 
    616                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); // drop the lanes flagged in clipMask
    617             }
    618 
    619             // ZTest for this sample
    620             ///@todo Need to uncomment out this bucket.
    621             //AR_BEGIN(BEDepthBucket, pDC->drawId);
    622             depthPassMask[sample] = vCoverageMask[sample];
    623             stencilPassMask[sample] = vCoverageMask[sample];
    624             depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    625                                                      vZ[sample], pDepthSample, vCoverageMask[sample],
    626                                                      pStencilSample, &stencilPassMask[sample]);
    627             //AR_END(BEDepthBucket, 0);
    628 
    629             // early-exit if no pixels passed depth or earlyZ is forced on
    630             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
    631             {
    632                 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
    633                                   pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
    634 
    635                 if(!_simd_movemask_ps(depthPassMask[sample]))
    636                 {
    637                     continue; // no lane passed: this sample adds nothing to the accumulators below
    638                 }
    639             }
    640             anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); // union of passing lanes over all samples
    641             uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
    642             statCount += _mm_popcnt_u32(statMask);
    643         }
    644 
    645         activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); // keep only lanes that passed depth on at least one sample
    646         // return number of samples that passed depth and coverage
    647         return statCount;
    648     }
    649 
    650     // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
    651     simdscalar vZ[T::MultisampleT::numCoverageSamples];
    652     simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
    653     simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
    654     simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
    655 
    656 private:
    657     // functor inputs
    658     DRAW_CONTEXT* pDC;
    659     uint32_t workerId; // not referenced by name in this struct; presumably used by the AR_* macros - TODO confirm
    660 
    661     const SWR_TRIANGLE_DESC& work;
    662     const BarycentricCoeffs& coeffs;
    663     const API_STATE& state;
    664     const SWR_PS_STATE& psState;
    665     const uint8_t clipDistanceMask;
    666     uint8_t*& pDepthBuffer; // reference to the caller's pointer, re-read on every operator() call
    667     uint8_t*& pStencilBuffer; // reference to the caller's pointer, re-read on every operator() call
    668 };
    669 
    /// @brief Computes the pixel-center barycentrics and 1/w in the pixel shader context.
    ///        Reads psContext.vX.center / vY.center; writes vI.center, vJ.center and vOneOverW.center.
    /// @param coeffs - I/J plane coefficients, vRecipDet and the 1/w coefficients.
    /// @param psContext - pixel shader context that is read and updated.
    670 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
    671 {
    672     // evaluate I,J
    673     psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
    674     psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
    675     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); // scale by vRecipDet
    676     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
    677 
    678     // interpolate 1/w
    679     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
    680 }
    681 
    /// @brief Computes the per-sample barycentrics and 1/w in the pixel shader context.
    ///        Reads psContext.vX.sample / vY.sample, which the caller must have set for the sample
    ///        of interest; writes vI.sample, vJ.sample and vOneOverW.sample.
    /// @param coeffs - I/J plane coefficients, vRecipDet and the 1/w coefficients.
    /// @param psContext - pixel shader context that is read and updated.
    682 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
    683 {
    684     // evaluate I,J
    685     psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
    686     psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
    687     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); // scale by vRecipDet
    688     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
    689 
    690     // interpolate 1/w
    691     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
    692 }
    693 
    694 // Merge Output to 4x2 SIMD Tile Format
    // For each of the first NumRT render targets: seeds blendOut with the pixel shader output,
    // calls the target's blend function when one is set, then stores every channel that is not
    // write-disabled using the mask (coverageMask AND depthPassMask).
    //   psContext     - supplies shaded[] outputs and oMask.
    //   pColorBase    - per render target base pointers; RasterTileColorOffset(sample) is added.
    //   sample        - sample index, used for the color offset and passed to the blend function.
    //   pBlendState   - blend state; renderTarget[rt] provides the per-channel write disables.
    //   pfnBlendFunc  - per render target blend functions; a nullptr entry skips blending.
    //   coverageMask  - taken by reference and handed to the blend function by pointer, so any
    //                   change made there is seen by later render targets and by the caller.
    //   depthPassMask - combined with coverageMask to form the final write mask.
    //   NumRT         - number of render targets processed.
    695 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
    696     const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
    697 {
    698     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    699     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
    700     simdvector blendOut;
    701 
    702     for(uint32_t rt = 0; rt < NumRT; ++rt)
    703     {
    704         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
    705 
    706         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
    707         // pfnBlendFunc may not update all channels.  Initialize with PS output.
    708         /// TODO: move this into the blend JIT.
    709         blendOut = psContext.shaded[rt];
    710 
    711         // Blend outputs and update coverage mask for alpha test
    712         if(pfnBlendFunc[rt] != nullptr)
    713         {
    714             pfnBlendFunc[rt](
    715                 pBlendState,
    716                 psContext.shaded[rt],
    717                 psContext.shaded[1], // always output 1, regardless of rt
    718                 psContext.shaded[0].w, // always the w channel of output 0, regardless of rt
    719                 sample,
    720                 pColorSample,
    721                 blendOut,
    722                 &psContext.oMask,
    723                 (simdscalari*)&coverageMask);
    724         }
    725 
    726         // final write mask
    727         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
    728 
    729         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
    730         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    731 
    732         const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); // byte stride between consecutive channels in the stores below
    733 
    734         // store with color mask
    735         if(!pRTBlend->writeDisableRed)
    736         {
    737             _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
    738         }
    739         if(!pRTBlend->writeDisableGreen)
    740         {
    741             _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
    742         }
    743         if(!pRTBlend->writeDisableBlue)
    744         {
    745             _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
    746         }
    747         if(!pRTBlend->writeDisableAlpha)
    748         {
    749             _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
    750         }
    751     }
    752 }
    753 
    754 #if USE_8x2_TILE_BACKEND
    755 // Merge Output to 8x2 SIMD16 Tile Format
    // Same flow as the 4x2 variant, with these differences visible in the code:
    //   - useAlternateOffset adds sizeof(simdscalar) to the raster tile color offset;
    //   - a channel occupies every second simdscalar (indices 0, 2, 4, 6 of pColorSample);
    //   - the current color is first copied into blendSrc, and blendSrc (not the hot tile
    //     pointer) is what the blend function receives;
    //   - blendSrc is only loaded when the render target's bit is set in colorBufferEnableMask.
    //     NOTE(review): otherwise the blend function appears to receive blendSrc without it
    //     having been loaded - confirm that simdvector's default state is acceptable there.
    //   coverageMask is taken by reference and handed to the blend function by pointer, so any
    //   change made there is seen by later render targets and by the caller.
    756 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
    757     const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
    758 {
    759     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    760     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
    761 
    762     if (useAlternateOffset)
    763     {
    764         rasterTileColorOffset += sizeof(simdscalar);
    765     }
    766 
    767     simdvector blendSrc;
    768     simdvector blendOut;
    769 
    770     uint32_t colorBufferBit = 1; // bit for render target rt; shifted left once per loop iteration
    771     for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
    772     {
    773         simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
    774 
    775         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
    776         // pfnBlendFunc may not update all channels.  Initialize with PS output.
    777         /// TODO: move this into the blend JIT.
    778         blendOut = psContext.shaded[rt];
    779 
    780         if (colorBufferBit & colorBufferEnableMask)
    781         {
    782             blendSrc[0] = pColorSample[0];
    783             blendSrc[1] = pColorSample[2];
    784             blendSrc[2] = pColorSample[4];
    785             blendSrc[3] = pColorSample[6];
    786         }
    787 
    788         // Blend outputs and update coverage mask for alpha test
    789         if (pfnBlendFunc[rt] != nullptr)
    790         {
    791             pfnBlendFunc[rt](
    792                 pBlendState,
    793                 psContext.shaded[rt],
    794                 psContext.shaded[1], // always output 1, regardless of rt
    795                 psContext.shaded[0].w, // always the w channel of output 0, regardless of rt
    796                 sample,
    797                 reinterpret_cast<uint8_t *>(&blendSrc),
    798                 blendOut,
    799                 &psContext.oMask,
    800                 reinterpret_cast<simdscalari *>(&coverageMask));
    801         }
    802 
    803         // final write mask
    804         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
    805 
    806         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
    807         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    808 
    809         // store with color mask
    810         if (!pRTBlend->writeDisableRed)
    811         {
    812             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
    813         }
    814         if (!pRTBlend->writeDisableGreen)
    815         {
    816             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
    817         }
    818         if (!pRTBlend->writeDisableBlue)
    819         {
    820             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
    821         }
    822         if (!pRTBlend->writeDisableAlpha)
    823         {
    824             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
    825         }
    826     }
    827 }
    828 
    829 #endif
    /// @brief Compile-time description of a backend configuration.
    /// @tparam sampleCountT - sample count, cast to SWR_MULTISAMPLE_COUNT for MultisampleT.
    /// @tparam samplePattern - bIsStandardPattern is true when this equals SWR_MSAA_STANDARD_PATTERN.
    /// @tparam coverage - exposed unchanged as InputCoverage.
    /// @tparam centroid - bCentroidPos is true when this is 1.
    /// @tparam forced - bForcedSampleCount is true when this is 1.
    /// @tparam canEarlyZ - bCanEarlyZ is true when this is 1.
    830 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
    831          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
    832 struct SwrBackendTraits
    833 {
    834     static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
    835     static const uint32_t InputCoverage = coverage;
    836     static const bool bCentroidPos = (centroid == 1);
    837     static const bool bForcedSampleCount = (forced == 1);
    838     static const bool bCanEarlyZ = (canEarlyZ == 1);
    839     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT; // any non-standard pattern maps to SWR_MSAA_CENTER_PATTERN
    840 };
    841