Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file backend.h
     24 *
     25 * @brief Backend handles rasterization, pixel shading and output merger
     26 *        operations.
     27 *
     28 ******************************************************************************/
     29 #pragma once
     30 
     31 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
     32 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
     33 
     34 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
     35 
     36 
     37 enum SWR_BACKEND_FUNCS
     38 {
     39     SWR_BACKEND_SINGLE_SAMPLE,
     40     SWR_BACKEND_MSAA_PIXEL_RATE,
     41     SWR_BACKEND_MSAA_SAMPLE_RATE,
     42     SWR_BACKEND_FUNCS_MAX,
     43 };
     44 
     45 #if KNOB_SIMD_WIDTH == 8
     46 static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
     47 static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
     48 static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
     49 static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
     50 #define MASK 0xff
     51 #endif
     52 
     53 static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
     54 {
     55     simdscalar vClipMask = _simd_setzero_ps();
     56     uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
     57 
     58     for (uint32_t i = 0; i < numClipDistance; ++i)
     59     {
     60         // pull triangle clip distance values from clip buffer
     61         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
     62         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
     63         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
     64 
     65         // interpolate
     66         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
     67 
     68         // clip if interpolated clip distance is < 0 || NAN
     69         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
     70 
     71         vClipMask = _simd_or_ps(vClipMask, vCull);
     72     }
     73 
     74     return _simd_movemask_ps(vClipMask);
     75 }
     76 
     77 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
     78 {
     79     static const uint32_t RasterTileColorOffsets[16]
     80     { 0,
     81       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
     82       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
     83       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
     84       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
     85       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
     86       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
     87       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
     88       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
     89       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
     90       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
     91       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
     92       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
     93       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
     94       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
     95       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
     96     };
     97     assert(sampleNum < 16);
     98     return RasterTileColorOffsets[sampleNum];
     99 }
    100 
    101 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
    102 {
    103     static const uint32_t RasterTileDepthOffsets[16]
    104     { 0,
    105       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
    106       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
    107       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
    108       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
    109       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
    110       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
    111       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
    112       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
    113       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
    114       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
    115       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
    116       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
    117       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
    118       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
    119       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
    120     };
    121     assert(sampleNum < 16);
    122     return RasterTileDepthOffsets[sampleNum];
    123 }
    124 
    125 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
    126 {
    127     static const uint32_t RasterTileStencilOffsets[16]
    128     { 0,
    129       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
    130       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
    131       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
    132       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
    133       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
    134       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
    135       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
    136       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
    137       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
    138       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
    139       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
    140       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
    141       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
    142       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
    143       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
    144     };
    145     assert(sampleNum < 16);
    146     return RasterTileStencilOffsets[sampleNum];
    147 }
    148 
    149 template<typename T, uint32_t InputCoverage>
    150 struct generateInputCoverage
    151 {
    152     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
    153     {
    154         // will need to update for avx512
    155         assert(KNOB_SIMD_WIDTH == 8);
    156 
    157         simdscalari mask[2];
    158         simdscalari sampleCoverage[2];
    159 
    160         if(T::bIsCenterPattern)
    161         {
    162             // center coverage is the same for all samples; just broadcast to the sample slots
    163             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
    164             if(T::MultisampleT::numSamples == 1)
    165             {
    166                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
    167             }
    168             else if(T::MultisampleT::numSamples == 2)
    169             {
    170                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
    171             }
    172             else if(T::MultisampleT::numSamples == 4)
    173             {
    174                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
    175             }
    176             else if(T::MultisampleT::numSamples == 8)
    177             {
    178                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
    179             }
    180             else if(T::MultisampleT::numSamples == 16)
    181             {
    182                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
    183                 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
    184             }
    185         }
    186         else
    187         {
    188             simdscalari src = _simd_set1_epi32(0);
    189             simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
    190 
    191             if(T::MultisampleT::numSamples == 1)
    192             {
    193                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
    194             }
    195             else if(T::MultisampleT::numSamples == 2)
    196             {
    197                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
    198             }
    199             else if(T::MultisampleT::numSamples == 4)
    200             {
    201                 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    202             }
    203             else if(T::MultisampleT::numSamples == 8)
    204             {
    205                 mask[0] = _simd_set1_epi32(-1);
    206             }
    207             else if(T::MultisampleT::numSamples == 16)
    208             {
    209                 mask[0] = _simd_set1_epi32(-1);
    210                 mask[1] = _simd_set1_epi32(-1);
    211                 index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
    212             }
    213 
    214             // gather coverage for samples 0-7
    215             sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
    216             if(T::MultisampleT::numSamples > 8)
    217             {
    218                 // gather coverage for samples 8-15
    219                 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
    220             }
    221         }
    222 
    223         mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
    224                                   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
    225         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
    226         simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
    227 
    228         simdscalari packedCoverage1;
    229         if(T::MultisampleT::numSamples > 8)
    230         {
    231             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
    232             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
    233         }
    234 
    235     #if (KNOB_ARCH == KNOB_ARCH_AVX)
    236         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
    237         simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
    238         simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
    239         packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
    240 
    241         simdscalari packedSampleCoverage;
    242         if(T::MultisampleT::numSamples > 8)
    243         {
    244             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
    245             hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
    246             shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
    247             shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
    248             packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
    249             packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
    250         }
    251         else
    252         {
    253             packedSampleCoverage = packedCoverage0;
    254         }
    255     #else
    256         simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
    257         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
    258         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
    259 
    260         simdscalari packedSampleCoverage;
    261         if(T::MultisampleT::numSamples > 8)
    262         {
    263             permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
    264             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
    265             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
    266 
    267             // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
    268             packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
    269         }
    270         else
    271         {
    272             packedSampleCoverage = packedCoverage0;
    273         }
    274     #endif
    275 
    276         for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
    277         {
    278             // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
    279             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
    280 
    281             if(!T::bForcedSampleCount)
    282             {
    283                 // input coverage has to be anded with sample mask if MSAA isn't forced on
    284                 inputMask[i] &= sampleMask;
    285             }
    286 
    287             // shift to the next pixel in the 4x2
    288             packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
    289         }
    290     }
    291 
    292     INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
    293     {
    294         uint32_t inputMask[KNOB_SIMD_WIDTH];
    295         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
    296         inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
    297     }
    298 
    299 };
    300 
    301 template<typename T>
    302 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
    303 {
    304     INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
    305     {
    306         // will need to update for avx512
    307         assert(KNOB_SIMD_WIDTH == 8);
    308         simdscalari vec = _simd_set1_epi32(coverageMask[0]);
    309         const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    310         vec = _simd_and_si(vec, bit);
    311         vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
    312         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
    313         inputCoverage = _simd_castsi_ps(vec);
    314     }
    315 
    316     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
    317     {
    318         uint32_t simdCoverage = (coverageMask[0] & MASK);
    319         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
    320         for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
    321         {
    322             // set all samples to covered if conservative coverage mask is set for that pixel
    323             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
    324         }
    325     }
    326 };
    327 
    328 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    329 // Centroid behaves exactly as follows :
    330 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
    331 //     have a sample location there).
    332 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
    333 //     coverage with the SampleMask Rasterizer State.
    334 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
    335 //     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
    336 //     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
    337 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    338 template<typename T>
    339 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
    340                             const uint64_t *const coverageMask, const uint32_t sampleMask,
    341                             simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
    342 {
    343     uint32_t inputMask[KNOB_SIMD_WIDTH];
    344     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
    345 
    346     // Case (2) - partially covered pixel
    347 
    348     // scan for first covered sample per pixel in the 4x2 span
    349     unsigned long sampleNum[KNOB_SIMD_WIDTH];
    350     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
    351     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
    352     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
    353     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
    354     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
    355     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
    356     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
    357     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
    358 
    359     // look up and set the sample offsets from UL pixel corner for first covered sample
    360     simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
    361                                     samplePos.X(sampleNum[6]),
    362                                     samplePos.X(sampleNum[5]),
    363                                     samplePos.X(sampleNum[4]),
    364                                     samplePos.X(sampleNum[3]),
    365                                     samplePos.X(sampleNum[2]),
    366                                     samplePos.X(sampleNum[1]),
    367                                     samplePos.X(sampleNum[0]));
    368 
    369     simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
    370                                     samplePos.Y(sampleNum[6]),
    371                                     samplePos.Y(sampleNum[5]),
    372                                     samplePos.Y(sampleNum[4]),
    373                                     samplePos.Y(sampleNum[3]),
    374                                     samplePos.Y(sampleNum[2]),
    375                                     samplePos.Y(sampleNum[1]),
    376                                     samplePos.Y(sampleNum[0]));
    377     // add sample offset to UL pixel corner
    378     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
    379     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
    380 
    381     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
    382     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
    383     simdscalari vInputCoveragei =  _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
    384     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
    385 
    386     static const simdscalari vZero = _simd_setzero_si();
    387     const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
    388     simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
    389     simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
    390     simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
    391 
    392     simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
    393 
    394     // set the centroid position based on results from above
    395     psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
    396     psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
    397 
    398     // Case (3a) No samples covered and partial sample mask
    399     simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
    400     // sample mask should never be all 0's for this case, but handle it anyways
    401     unsigned long firstCoveredSampleMaskSample = 0;
    402     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
    403 
    404     simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
    405 
    406     vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
    407     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
    408 
    409     // blend in case 3a pixel locations
    410     psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
    411     psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
    412 }
    413 
    414 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
    415                                      const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
    416 {
    417     // evaluate I,J
    418     psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
    419     psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
    420     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
    421     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
    422 
    423     // interpolate 1/w
    424     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
    425 }
    426 
    427 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
    428 {
    429     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
    430     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
    431 
    432     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
    433 }
    434 
    435 template<typename T>
    436 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
    437 {
    438     // RT has to be single sample if we're in forcedMSAA mode
    439     if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
    440     {
    441         return 1;
    442     }
    443     // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
    444     else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
    445     {
    446         return GetNumSamples(blendSampleCount);
    447     }
    448     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
    449     else
    450     {
    451         return T::MultisampleT::numSamples;
    452     }
    453 }
    454 
    455 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
    456 {
    457     // broadcast scalars
    458 
    459     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
    460     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
    461     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
    462 
    463     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
    464     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
    465     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
    466 
    467     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
    468     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
    469     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
    470 
    471     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
    472 
    473     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
    474     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
    475     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
    476 }
    477 
    478 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
    479 {
    480 
    481     DWORD index;
    482     while (_BitScanForward(&index, colorHotTileMask))
    483     {
    484         assert(index < SWR_NUM_RENDERTARGETS);
    485         colorHotTileMask &= ~(1 << index);
    486         pColorBuffer[index] = renderBuffers.pColor[index];
    487     }
    488 
    489     if (pDepthBuffer)
    490     {
    491         *pDepthBuffer = renderBuffers.pDepth;
    492     }
    493 
    494     if (pStencilBuffer)
    495     {
    496         *pStencilBuffer = renderBuffers.pStencil;;
    497     }
    498 }
    499 
    500 template<typename T>
    501 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
    502 {
    503     psContext->pAttribs = work.pAttribs;
    504     psContext->pPerspAttribs = work.pPerspAttribs;
    505     psContext->frontFace = work.triFlags.frontFacing;
    506     psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
    507 
    508     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
    509     psContext->I = work.I;
    510     psContext->J = work.J;
    511 
    512     psContext->recipDet = work.recipDet;
    513     psContext->pRecipW = work.pRecipW;
    514     psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
    515     psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
    516     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
    517     psContext->sampleIndex = 0;
    518 }
    519 
    520 template<typename T, bool IsSingleSample>
    521 void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
    522                   const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
    523 {
    524     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
    525     {
    526         // for 1x case, centroid is pixel center
    527         psContext->vX.centroid = psContext->vX.center;
    528         psContext->vY.centroid = psContext->vY.center;
    529         psContext->vI.centroid = psContext->vI.center;
    530         psContext->vJ.centroid = psContext->vJ.center;
    531         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
    532     }
    533     else
    534     {
    535         if (T::bCentroidPos)
    536         {
    537             ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
    538             if (T::bIsCenterPattern)
    539             {
    540                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
    541                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
    542             }
    543             else
    544             {
    545                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
    546                 CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
    547             }
    548 
    549             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
    550         }
    551         else
    552         {
    553             psContext->vX.centroid = psContext->vX.sample;
    554             psContext->vY.centroid = psContext->vY.sample;
    555         }
    556     }
    557 }
    558 
    559 template<typename T>
    560 struct PixelRateZTestLoop
    561 {
    562     PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
    563                        uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
    564                        pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
    565                        samplePos(state.rastState.samplePositions),
    566                        clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
    567 
    568     INLINE
    569     uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
    570                         const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
    571     {
    572         SWR_CONTEXT *pContext = pDC->pContext;
    573 
    574         uint32_t statCount = 0;
    575         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
    576         for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
    577         {
    578             const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
    579             vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
    580 
    581             if(!_simd_movemask_ps(vCoverageMask[sample]))
    582             {
    583                 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
    584                 continue;
    585             }
    586 
    587             // offset depth/stencil buffers current sample
    588             uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
    589             uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
    590 
    591             if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
    592             {
    593                 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    594 
    595                 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
    596 
    597                 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
    598                 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
    599 
    600                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
    601             }
    602 
    603             AR_BEGIN(BEBarycentric, pDC->drawId);
    604 
    605             // calculate per sample positions
    606             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
    607             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
    608 
    609             // calc I & J per sample
    610             CalcSampleBarycentrics(coeffs, psContext);
    611 
    612             if(psState.writesODepth)
    613             {
    614                 {
    615                     // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
    616                     vZ[sample] = psContext.vZ;
    617                 }
    618             }
    619             else
    620             {
    621                 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
    622                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
    623             }
    624 
    625             AR_END(BEBarycentric, 0);
    626 
    627             ///@todo: perspective correct vs non-perspective correct clipping?
    628             // if clip distances are enabled, we need to interpolate for each sample
    629             if(clipDistanceMask)
    630             {
    631                 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
    632 
    633                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
    634             }
    635 
    636             // ZTest for this sample
    637             ///@todo Need to uncomment out this bucket.
    638             //AR_BEGIN(BEDepthBucket, pDC->drawId);
    639             depthPassMask[sample] = vCoverageMask[sample];
    640             stencilPassMask[sample] = vCoverageMask[sample];
    641             depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
    642                                                      vZ[sample], pDepthSample, vCoverageMask[sample],
    643                                                      pStencilSample, &stencilPassMask[sample]);
    644             //AR_END(BEDepthBucket, 0);
    645 
    646             // early-exit if no pixels passed depth or earlyZ is forced on
    647             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
    648             {
    649                 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
    650                                   pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
    651 
    652                 if(!_simd_movemask_ps(depthPassMask[sample]))
    653                 {
    654                     continue;
    655                 }
    656             }
    657             anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
    658             uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
    659             statCount += _mm_popcnt_u32(statMask);
    660         }
    661 
    662         activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
    663         // return number of samples that passed depth and coverage
    664         return statCount;
    665     }
    666 
    667     // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
    668     simdscalar vZ[T::MultisampleT::numCoverageSamples];
    669     simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
    670     simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
    671     simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
    672 
    673 private:
    674     // functor inputs
    675     DRAW_CONTEXT* pDC;
    676     uint32_t workerId;
    677 
    678     const SWR_TRIANGLE_DESC& work;
    679     const BarycentricCoeffs& coeffs;
    680     const API_STATE& state;
    681     const SWR_PS_STATE& psState;
    682     const SWR_MULTISAMPLE_POS& samplePos;
    683     const uint8_t clipDistanceMask;
    684     uint8_t*& pDepthBuffer;
    685     uint8_t*& pStencilBuffer;
    686 };
    687 
    688 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
    689 {
    690     // evaluate I,J
    691     psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
    692     psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
    693     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
    694     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
    695 
    696     // interpolate 1/w
    697     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
    698 }
    699 
    700 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
    701 {
    702     // evaluate I,J
    703     psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
    704     psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
    705     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
    706     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
    707 
    708     // interpolate 1/w
    709     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
    710 }
    711 
    712 // Merge Output to 4x2 SIMD Tile Format
    713 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
    714     const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
    715 {
    716     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    717     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
    718     simdvector blendOut;
    719 
    720     DWORD rt = 0;
    721     while (_BitScanForward(&rt, renderTargetMask))
    722     {
    723         renderTargetMask &= ~(1 << rt);
    724         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
    725 
    726         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
    727 
    728         {
    729             // pfnBlendFunc may not update all channels.  Initialize with PS output.
    730             /// TODO: move this into the blend JIT.
    731             blendOut = psContext.shaded[rt];
    732 
    733             // Blend outputs and update coverage mask for alpha test
    734             if(pfnBlendFunc[rt] != nullptr)
    735             {
    736                 pfnBlendFunc[rt](
    737                     pBlendState,
    738                     psContext.shaded[rt],
    739                     psContext.shaded[1],
    740                     psContext.shaded[0].w,
    741                     sample,
    742                     pColorSample,
    743                     blendOut,
    744                     &psContext.oMask,
    745                     (simdscalari*)&coverageMask);
    746             }
    747         }
    748 
    749         // final write mask
    750         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
    751 
    752         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
    753         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    754 
    755         const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
    756 
    757         // store with color mask
    758         if(!pRTBlend->writeDisableRed)
    759         {
    760             _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
    761         }
    762         if(!pRTBlend->writeDisableGreen)
    763         {
    764             _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
    765         }
    766         if(!pRTBlend->writeDisableBlue)
    767         {
    768             _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
    769         }
    770         if(!pRTBlend->writeDisableAlpha)
    771         {
    772             _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
    773         }
    774     }
    775 }
    776 
    777 #if USE_8x2_TILE_BACKEND
    778 // Merge Output to 8x2 SIMD16 Tile Format
    779 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
    780     const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
    781 {
    782     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    783     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
    784 
    785     if (useAlternateOffset)
    786     {
    787         rasterTileColorOffset += sizeof(simdscalar);
    788     }
    789 
    790     simdvector blendSrc;
    791     simdvector blendOut;
    792 
    793     DWORD rt;
    794     while (_BitScanForward(&rt, renderTargetMask))
    795     {
    796         renderTargetMask &= ~(1 << rt);
    797 
    798         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
    799 
    800         simdscalar* pColorSample;
    801         bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
    802         if (hotTileEnable)
    803         {
    804             pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
    805             blendSrc[0] = pColorSample[0];
    806             blendSrc[1] = pColorSample[2];
    807             blendSrc[2] = pColorSample[4];
    808             blendSrc[3] = pColorSample[6];
    809         }
    810         else
    811         {
    812             pColorSample = nullptr;
    813         }
    814 
    815         {
    816             // pfnBlendFunc may not update all channels.  Initialize with PS output.
    817             /// TODO: move this into the blend JIT.
    818             blendOut = psContext.shaded[rt];
    819 
    820             // Blend outputs and update coverage mask for alpha test
    821             if(pfnBlendFunc[rt] != nullptr)
    822             {
    823                 pfnBlendFunc[rt](
    824                     pBlendState,
    825                     psContext.shaded[rt],
    826                     psContext.shaded[1],
    827                     psContext.shaded[0].w,
    828                     sample,
    829                     reinterpret_cast<uint8_t *>(&blendSrc),
    830                     blendOut,
    831                     &psContext.oMask,
    832                     reinterpret_cast<simdscalari *>(&coverageMask));
    833             }
    834         }
    835 
    836         // final write mask
    837         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
    838 
    839         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
    840         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    841 
    842         // store with color mask
    843         if (!pRTBlend->writeDisableRed)
    844         {
    845             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
    846         }
    847         if (!pRTBlend->writeDisableGreen)
    848         {
    849             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
    850         }
    851         if (!pRTBlend->writeDisableBlue)
    852         {
    853             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
    854         }
    855         if (!pRTBlend->writeDisableAlpha)
    856         {
    857             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
    858         }
    859     }
    860 }
    861 
    862 #endif
    863 
    864 template<typename T>
    865 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
    866 {
    867     ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
    868 
    869 
    870     SWR_CONTEXT *pContext = pDC->pContext;
    871 
    872     AR_BEGIN(BEPixelRateBackend, pDC->drawId);
    873     AR_BEGIN(BESetup, pDC->drawId);
    874 
    875     const API_STATE &state = GetApiState(pDC);
    876 
    877     BarycentricCoeffs coeffs;
    878     SetupBarycentricCoeffs(&coeffs, work);
    879 
    880     SWR_PS_CONTEXT psContext;
    881     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    882     SetupPixelShaderContext<T>(&psContext, samplePos, work);
    883 
    884     uint8_t *pDepthBuffer, *pStencilBuffer;
    885     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
    886 
    887     AR_END(BESetup, 0);
    888 
    889     PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
    890 
    891     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
    892     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    893 
    894     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    895 
    896     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    897     {
    898         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
    899         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
    900 
    901         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
    902 
    903         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
    904         {
    905 #if USE_8x2_TILE_BACKEND
    906             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
    907 #endif
    908             simdscalar activeLanes;
    909             if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
    910             activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
    911 
    912             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
    913             {
    914                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
    915 
    916                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
    917             }
    918 
    919             AR_BEGIN(BEBarycentric, pDC->drawId);
    920 
    921             CalcPixelBarycentrics(coeffs, psContext);
    922 
    923             CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
    924 
    925             AR_END(BEBarycentric, 0);
    926 
    927             if(T::bForcedSampleCount)
    928             {
    929                 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
    930                 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
    931                 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
    932             }
    933 
    934             // Early-Z?
    935             if(T::bCanEarlyZ && !T::bForcedSampleCount)
    936             {
    937                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
    938                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
    939                 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
    940             }
    941 
    942             // if we have no covered samples that passed depth at this point, go to next tile
    943             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    944 
    945             if(state.psState.usesSourceDepth)
    946             {
    947                 AR_BEGIN(BEBarycentric, pDC->drawId);
    948                 // interpolate and quantize z
    949                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
    950                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
    951                 AR_END(BEBarycentric, 0);
    952             }
    953 
    954             // pixels that are currently active
    955             psContext.activeMask = _simd_castps_si(activeLanes);
    956             psContext.oMask = T::MultisampleT::FullSampleMask();
    957 
    958             // execute pixel shader
    959             AR_BEGIN(BEPixelShader, pDC->drawId);
    960             state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
    961             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
    962             AR_END(BEPixelShader, 0);
    963 
    964             // update active lanes to remove any discarded or oMask'd pixels
    965             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
    966             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    967 
    968             // late-Z
    969             if(!T::bCanEarlyZ && !T::bForcedSampleCount)
    970             {
    971                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
    972                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
    973                 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
    974             }
    975 
    976             // if we have no covered samples that passed depth at this point, skip OM and go to next tile
    977             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
    978 
    979             // output merger
    980             // loop over all samples, broadcasting the results of the PS to all passing pixels
    981             for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
    982             {
    983                 AR_BEGIN(BEOutputMerger, pDC->drawId);
    984                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
    985                 uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
    986                 simdscalar coverageMask, depthMask;
    987                 if(T::bForcedSampleCount)
    988                 {
    989                     coverageMask = depthMask = activeLanes;
    990                 }
    991                 else
    992                 {
    993                     coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
    994                     depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
    995                     if(!_simd_movemask_ps(depthMask))
    996                     {
    997                         // stencil should already have been written in early/lateZ tests
    998                         AR_END(BEOutputMerger, 0);
    999                         continue;
   1000                     }
   1001                 }
   1002 
   1003                 // broadcast the results of the PS to all passing pixels
   1004 #if USE_8x2_TILE_BACKEND
   1005                 OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
   1006 #else // USE_8x2_TILE_BACKEND
   1007                 OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
   1008 #endif // USE_8x2_TILE_BACKEND
   1009 
   1010                 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
   1011                 {
   1012                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
   1013                     uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
   1014 
   1015                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
   1016                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
   1017                 }
   1018                 AR_END(BEOutputMerger, 0);
   1019             }
   1020 Endtile:
   1021             AR_BEGIN(BEEndTile, pDC->drawId);
   1022 
   1023             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
   1024             {
   1025                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1026             }
   1027 
   1028             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
   1029             {
   1030                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1031             }
   1032             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
   1033 
   1034 #if USE_8x2_TILE_BACKEND
   1035             if (useAlternateOffset)
   1036             {
   1037                 DWORD rt;
   1038                 uint32_t rtMask = state.colorHottileEnable;
   1039                 while (_BitScanForward(&rt, rtMask))
   1040                 {
   1041                     rtMask &= ~(1 << rt);
   1042                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
   1043                 }
   1044             }
   1045 #else
   1046             DWORD rt;
   1047             uint32_t rtMask = state.colorHottileEnable;
   1048             while (_BitScanForward(&rt, rtMask))
   1049             {
   1050                 rtMask &= ~(1 << rt);
   1051                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
   1052             }
   1053 #endif
   1054             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
   1055             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
   1056 
   1057             AR_END(BEEndTile, 0);
   1058 
   1059             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
   1060             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
   1061         }
   1062 
   1063         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
   1064         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
   1065     }
   1066 
   1067     AR_END(BEPixelRateBackend, 0);
   1068 }
   1069 
   1070 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
   1071          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
   1072     >
   1073 struct SwrBackendTraits
   1074 {
   1075     static const bool bIsCenterPattern = (isCenter == 1);
   1076     static const uint32_t InputCoverage = coverage;
   1077     static const bool bCentroidPos = (centroid == 1);
   1078     static const bool bForcedSampleCount = (forced == 1);
   1079     static const bool bCanEarlyZ = (canEarlyZ == 1);
   1080     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
   1081 };
   1082