Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file rasterizer.cpp
     24 *
     25 * @brief Implementation for the rasterizer.
     26 *
     27 ******************************************************************************/
     28 
#include <algorithm>
#include <cstring>
#include <vector>
     31 
     32 #include "rasterizer.h"
     33 #include "rdtsc_core.h"
     34 #include "backend.h"
     35 #include "utils.h"
     36 #include "frontend.h"
     37 #include "tilemgr.h"
     38 #include "memory/tilingtraits.h"
     39 
// Table of rasterizer work functions, defined in another translation unit.
// The named dimensions are the multisample type, the input coverage mode and the
// valid-triangle-edge state; the meaning of the three [2] dimensions is not
// visible here -- TODO confirm against the definition of the table.
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declarations of function templates; their definitions are not in this
// part of the file.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
     48 
// MASKTOVEC takes the four bits of a mask, most significant first (i3..i0), and
// emits them negated in element order {i0, i1, i2, i3}: a set bit becomes -1 and
// a clear bit becomes 0.
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
// Lookup table indexed by a 4-bit lane mask (0..15). Entry m has a negative value
// in every lane whose bit is set in m; it is used as the selector operand of
// _mm256_blendv_pd in adjustTopLeftRuleIntFix16.
// NOTE(review): relies on __m256d accepting an element-wise brace initializer,
// which is compiler specific -- confirm for every supported toolchain.
static const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
     69 
     70 struct POS
     71 {
     72     int32_t x, y;
     73 };
     74 
     75 struct EDGE
     76 {
     77     double a, b;                // a, b edge coefficients in fix8
     78     double stepQuadX;           // step to adjacent horizontal quad in fix16
     79     double stepQuadY;           // step to adjacent vertical quad in fix16
     80     double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
     81     double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
     82 
     83     __m256d vQuadOffsets;       // offsets for 4 samples of a quad
     84     __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
     85 };
     86 
//////////////////////////////////////////////////////////////////////////
/// @brief rasterize a raster tile partially covered by the triangle
/// @tparam NumEdges - number of edge equations passed in startEdges / pRastEdges
/// @tparam EdgeMaskT - type whose ::value is forwarded to UnrollerLMask to select
///        the edges handled by the unrolled 8x8 path
/// @param pDC - draw context; not referenced by this function body
/// @param startEdges - one starting value per edge equation (Ax + By + C); each is
///        broadcast and offset by the edge's vQuadOffsets to reach the 4 samples of
///        the first 2x2 quad
/// @param pRastEdges - per-edge constants; vQuadOffsets, stepQuadX and stepQuadY are
///        read here to step between quads while sweeping over the raster tile
/// @return coverage mask with 4 bits per 2x2 quad, quads stored row-major
///        (quad column varies fastest). A bit is set when the sign bit of every
///        evaluated edge is set for that sample.
template<uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
{
    uint64_t coverageMask = 0;

    // per-edge working state: current quad evaluation and the x/y quad steps
    __m256d vEdges[NumEdges];
    __m256d vStepX[NumEdges];
    __m256d vStepY[NumEdges];

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // broadcast the precomputed steps to the next quad in x and y
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int edgeMask[NumEdges];
    // assigned by UPDATE_MASK before every use
    uint64_t mask;

    // Per-edge operations handed to UnrollerLMask::step, which is defined elsewhere;
    // presumably it calls the lambda for each edge index selected by
    // EdgeMaskT::value -- TODO confirm.
    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
    auto update_lambda = [&](int e){mask &= edgeMask[e];};
    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};

// evaluate which pixels in the quad are covered
#define EVAL \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // if edge 0 is degenerate and will be skipped; init the mask
    // 'bit' is the position of the quad's 4 coverage bits inside coverageMask
#define UPDATE_MASK(bit) \
            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
                mask = 0xf;\
            }\
            else{\
                mask = edgeMask[0]; \
            }\
            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
            coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile

    // raster tile
    // 0  1  2  3  4  5  6  7
    // x  x
    // x  x ------------------>
    //                   x  x  |
    // <-----------------x  x  V
    // ..

    // Odd quad rows are walked right-to-left, so their bit positions count down;
    // the resulting mask layout is still row-major.

    // row 0
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    //row 1
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // generic path: walk every quad row left-to-right, 4 coverage bits per quad
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
    {
        // remember the row start so the next row can be reached with a single y step
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
        {
            // sign bits of each edge evaluated at the 4 samples of this quad
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            // a sample is covered only if every edge reports it
            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next quad in the x
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit+=4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;

}
    259 // Top left rule:
    260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
    261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
    262 // Top left: a sample is in if it is a top or left edge.
    263 // Out: !(horizontal && above) = !horizontal && below
    264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
    265 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
    266 {
    267     // if vA < 0, vC--
    268     // if vA == 0 && vB < 0, vC--
    269 
    270     __m256d vEdgeOut = vEdge;
    271     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
    272 
    273     // if vA < 0 (line is not horizontal and below)
    274     int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
    275 
    276     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
    277     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
    278     int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
    279     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
    280 
    281     // if either of these are true and we're on the line (edge == 0), bump it outside the line
    282     vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
    283 }
    284 
    285 //////////////////////////////////////////////////////////////////////////
    286 /// @brief calculates difference in precision between the result of manh
    287 /// calculation and the edge precision, based on compile time trait values
    288 template<typename RT>
    289 constexpr int64_t ManhToEdgePrecisionAdjust()
    290 {
    291     static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
    292                   "Inadequate precision of result of manh calculation ");
    293     return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
    294 }
    295 
    296 //////////////////////////////////////////////////////////////////////////
    297 /// @struct adjustEdgeConservative
    298 /// @brief Primary template definition used for partially specializing
    299 /// the adjustEdgeConservative function. This struct should never
    300 /// be instantiated.
    301 /// @tparam RT: rasterizer traits
    302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
    303 template <typename RT, typename ConservativeEdgeOffsetT>
    304 struct adjustEdgeConservative
    305 {
    306     //////////////////////////////////////////////////////////////////////////
    307     /// @brief Performs calculations to adjust each edge of a triangle away
    308     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    309     /// direction.
    310     ///
    311     /// Uncertainty regions arise from fixed point rounding, which
    312     /// can snap a vertex +/- by min fixed point value.
    313     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
    314     /// This allows the rasterizer to test for coverage only at the pixel center,
    315     /// instead of having to test individual pixel corners for conservative coverage
    316     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    317     {
    318         // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
    319         // from the pixel center (in the direction of the edge normal A/B)
    320 
    321         // edge = Ax + Bx + C - (manh/e)
    322         // manh = manhattan distance = abs(A) + abs(B)
    323         // e = absolute rounding error from snapping from float to fixed point precision
    324 
    325         // 'fixed point' multiply (in double to be avx1 friendly)
    326         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
    327         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
    328         __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
    329                                      _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
    330 
    331         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
    332                       "Inadequate precision of result of manh calculation ");
    333 
    334         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
    335         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
    336         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
    337 
    338         // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
    339         // this allows the rasterizer to do a single conservative coverage test to see if the primitive
    340         // intersects the pixel at all
    341         vEdge = _mm256_sub_pd(vEdge, manh);
    342     };
    343 };
    344 
    345 //////////////////////////////////////////////////////////////////////////
    346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
    347 template <typename RT>
    348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
    349 {
    350     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
    351 };
    352 
    353 //////////////////////////////////////////////////////////////////////////
    354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
    355 /// for conservative rast based on compile time trait values
    356 template<typename RT>
    357 constexpr int64_t ConservativeScissorOffset()
    358 {
    359     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
    360     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
    361     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
    362     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
    363     return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
    364 }
    365 
    366 //////////////////////////////////////////////////////////////////////////
    367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
    368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    369 /// direction.
    370 template <typename RT>
    371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
    372 {
    373     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
    374     int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
    375     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
    376 };
    377 
    378 //////////////////////////////////////////////////////////////////////////
    379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
    380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    381 /// direction.
    382 template <typename RT, typename OffsetT>
    383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
    384 {
    385     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
    386     int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
    387     return (Edge - manh);
    388 };
    389 
    390 //////////////////////////////////////////////////////////////////////////
    391 /// @brief Perform any needed adjustments to evaluated triangle edges
    392 template <typename RT, typename EdgeOffsetT>
    393 struct adjustEdgesFix16
    394 {
    395     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    396     {
    397         static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
    398                       "Edge equation expected to be in x.16 fixed point");
    399 
    400         static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
    401 
    402         // need to apply any edge offsets before applying the top-left rule
    403         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
    404 
    405         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    406     }
    407 };
    408 
    409 //////////////////////////////////////////////////////////////////////////
    410 /// @brief Perform top left adjustments to evaluated triangle edges
    411 template <typename RT>
    412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
    413 {
    414     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    415     {
    416         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    417     }
    418 };
    419 
    420 // max(abs(dz/dx), abs(dz,dy)
    421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
    422 {
    423     /*
    424     // evaluate i,j at (0,0)
    425     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    426     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
    427 
    428     // evaluate i,j at (1,0)
    429     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    430     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
    431 
    432     // compute dz/dx
    433     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
    434     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
    435     float dzdx = abs(d10 - d00);
    436 
    437     // evaluate i,j at (0,1)
    438     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
    439     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
    440 
    441     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
    442     float dzdy = abs(d01 - d00);
    443     */
    444 
    445     // optimized version of above
    446     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
    447     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
    448 
    449     return std::max(dzdx, dzdy);
    450 }
    451 
    452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
    453 {
    454     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
    455     {
    456         return (1.0f / (1 << 24));
    457     }
    458     else if (pState->depthFormat == R16_UNORM)
    459     {
    460         return (1.0f / (1 << 16));
    461     }
    462     else
    463     {
    464         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
    465 
    466         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
    467         float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
    468         uint32_t zMaxInt = *(uint32_t*)&zMax;
    469         zMaxInt &= 0x7f800000;
    470         zMax = *(float*)&zMaxInt;
    471 
    472         return zMax * (1.0f / (1 << 23));
    473     }
    474 }
    475 
    476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
    477 {
    478     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
    479     {
    480         return 0.0f;
    481     }
    482 
    483     float scale = pState->slopeScaledDepthBias;
    484     if (scale != 0.0f)
    485     {
    486         scale *= ComputeMaxDepthSlope(pTri);
    487     }
    488 
    489     float bias = pState->depthBias;
    490     if (!pState->depthBiasPreAdjusted)
    491     {
    492         bias *= ComputeBiasFactor(pState, pTri, z);
    493     }
    494     bias += scale;
    495 
    496     if (pState->depthBiasClamp > 0.0f)
    497     {
    498         bias = std::min(bias, pState->depthBiasClamp);
    499     }
    500     else if (pState->depthBiasClamp < 0.0f)
    501     {
    502         bias = std::max(bias, pState->depthBiasClamp);
    503     }
    504 
    505     return bias;
    506 }
    507 
// Prevent DCE (dead code elimination) by writing coverage mask from rasterizer to volatile.
// Only declared when toss points are enabled; the writes are not in this part of the file.
// NOTE(review): uses __declspec(thread) directly while the buffer below uses the
// THREAD macro -- confirm both resolve to the same thing on every platform.
#if KNOB_ENABLE_TOSS_POINTS
__declspec(thread) volatile uint64_t gToss;
#endif

// sizing constants for the scratch buffer below: 3 vertices per triangle,
// 4 components per attribute
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// try to avoid _chkstk insertions; make this thread local
// (one float per component, per attribute slot, per triangle vertex)
static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
    516 
    517 INLINE
    518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
    519 {
    520     edge.a = a;
    521     edge.b = b;
    522 
    523     // compute constant steps to adjacent quads
    524     edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
    525     edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
    526 
    527     // compute constant steps to adjacent raster tiles
    528     edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
    529     edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
    530 
    531     // compute quad offsets
    532     const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
    533     const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
    534 
    535     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
    536     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
    537     edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
    538 
    539     // compute raster tile offsets
    540     const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
    541     const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
    542 
    543     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
    544     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
    545     edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
    546 }
    547 
    548 INLINE
    549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
    550 {
    551     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
    552 }
    553 
    554 //////////////////////////////////////////////////////////////////////////
    555 /// @brief Primary template definition used for partially specializing
    556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
    557 /// corner to sample position, and test for coverage
    558 /// @tparam sampleCount: multisample count
    559 template <typename NumSamplesT>
    560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
    561                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
    562 {
    563     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
    564     // evaluate edge equations at the tile multisample bounding box
    565     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
    566     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
    567     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
    568     mask0 = _mm256_movemask_pd(vSampleBboxTest0);
    569     mask1 = _mm256_movemask_pd(vSampleBboxTest1);
    570     mask2 = _mm256_movemask_pd(vSampleBboxTest2);
    571 }
    572 
    573 //////////////////////////////////////////////////////////////////////////
    574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
    575 /// when only rasterizing a single coverage test point
    576 template <>
    577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
    578                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
    579 {
    580     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
    581     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
    582     mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
    583 }
    584 
    585 //////////////////////////////////////////////////////////////////////////
    586 /// @struct ComputeScissorEdges
    587 /// @brief Primary template definition. Allows the function to be generically
    588 /// called. When paired with below specializations, will result in an empty
    589 /// inlined function if scissor is not enabled
    590 /// @tparam RasterScissorEdgesT: is scissor enabled?
    591 /// @tparam IsConservativeT: is conservative rast enabled?
    592 /// @tparam RT: rasterizer traits
    593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
    594 struct ComputeScissorEdges
    595 {
    596     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    597                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
    598 };
    599 
    600 //////////////////////////////////////////////////////////////////////////
    601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
    602 /// specialization. Instantiated when conservative rast and scissor are enabled
    603 template <typename RT>
    604 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
    605 {
    606     //////////////////////////////////////////////////////////////////////////
    607     /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
    608     /// evaluate edge equations and offset them away from pixel center.
    609     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    610                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
    611     {
    612         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
    613         SWR_RECT scissor;
    614         scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
    615         scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
    616         scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
    617         scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
    618 
    619         POS topLeft{scissor.xmin, scissor.ymin};
    620         POS bottomLeft{scissor.xmin, scissor.ymax};
    621         POS topRight{scissor.xmax, scissor.ymin};
    622         POS bottomRight{scissor.xmax, scissor.ymax};
    623 
    624         // construct 4 scissor edges in ccw direction
    625         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
    626         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
    627         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
    628         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
    629 
    630         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
    631         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
    632         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
    633         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
    634 
    635         // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
    636         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
    637         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
    638         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
    639         adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
    640 
    641         // Upper left rule for scissor
    642         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
    643         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    644     }
    645 };
    646 
    647 //////////////////////////////////////////////////////////////////////////
    648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
    649 /// specialization. Instantiated when scissor is enabled and conservative rast
    650 /// is disabled.
    651 template <typename RT>
    652 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
    653 {
    654     //////////////////////////////////////////////////////////////////////////
    655     /// @brief Compute scissor edge vectors and evaluate edge equations
    656     INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    657                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
    658     {
    659         const SWR_RECT &scissor = scissorBBox;
    660         POS topLeft{scissor.xmin, scissor.ymin};
    661         POS bottomLeft{scissor.xmin, scissor.ymax};
    662         POS topRight{scissor.xmax, scissor.ymin};
    663         POS bottomRight{scissor.xmax, scissor.ymax};
    664 
    665         // construct 4 scissor edges in ccw direction
    666         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
    667         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
    668         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
    669         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
    670 
    671         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
    672         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
    673         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
    674         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
    675 
    676         // Upper left rule for scissor
    677         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
    678         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    679     }
    680 };
    681 
    682 //////////////////////////////////////////////////////////////////////////
    683 /// @brief Primary function template for TrivialRejectTest. Should
    684 /// never be called, but TemplateUnroller instantiates a few unused values,
    685 /// so it calls a runtime assert instead of a static_assert.
    686 template <typename ValidEdgeMaskT>
    687 INLINE bool TrivialRejectTest(const int, const int, const int)
    688 {
    689     SWR_INVALID("Primary templated function should never be called");
    690     return false;
    691 };
    692 
    693 //////////////////////////////////////////////////////////////////////////
    694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
    695 /// and edge 1 for trivial coverage reject
    696 template <>
    697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
    698 {
    699     return (!(mask0 && mask1)) ? true : false;
    700 };
    701 
    702 //////////////////////////////////////////////////////////////////////////
    703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
    704 /// and edge 2 for trivial coverage reject
    705 template <>
    706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
    707 {
    708     return (!(mask0 && mask2)) ? true : false;
    709 };
    710 
    711 //////////////////////////////////////////////////////////////////////////
    712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
    713 /// and edge 2 for trivial coverage reject
    714 template <>
    715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
    716 {
    717     return (!(mask1 && mask2)) ? true : false;
    718 };
    719 
    720 //////////////////////////////////////////////////////////////////////////
    721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
    722 /// primitive edges for trivial coverage reject
    723 template <>
    724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
    725 {
    726     return (!(mask0 && mask1 && mask2)) ? true : false;;
    727 };
    728 
    729 //////////////////////////////////////////////////////////////////////////
    730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
    731 /// point, so return false and rasterize against conservative BBox
    732 template <>
    733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
    734 {
    735     return false;
    736 };
    737 
    738 //////////////////////////////////////////////////////////////////////////
    739 /// @brief Primary function template for TrivialAcceptTest. Always returns
    740 /// false, since it will only be called for degenerate tris, and as such
    741 /// will never cover the entire raster tile
    742 template <typename ScissorEnableT>
    743 INLINE bool TrivialAcceptTest(const int, const int, const int)
    744 {
    745     return false;
    746 };
    747 
    748 //////////////////////////////////////////////////////////////////////////
    749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
    750 /// edge masks for a fully covered raster tile
    751 template <>
    752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
    753 {
    754     return ((mask0 & mask1 & mask2) == 0xf);
    755 };
    756 
    757 //////////////////////////////////////////////////////////////////////////
    758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
    759 /// in an empty function call if SVInnerCoverage isn't requested
    760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
    761 struct GenerateSVInnerCoverage
    762 {
    763     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
    764 };
    765 
    766 //////////////////////////////////////////////////////////////////////////
    767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
    768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
    769 /// edge values from OuterConservative to InnerConservative and rasterizes.
    770 template <typename RT>
    771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
    772 {
    773     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
    774     {
    775         SWR_CONTEXT *pContext = pDC->pContext;
    776 
    777         double startQuadEdgesAdj[RT::NumEdgesT::value];
    778         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    779         {
    780             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
    781         }
    782 
    783         // not trivial accept or reject, must rasterize full tile
    784         AR_BEGIN(BERasterizePartial, pDC->drawId);
    785         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
    786         AR_END(BERasterizePartial, 0);
    787     }
    788 };
    789 
    790 //////////////////////////////////////////////////////////////////////////
    791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
    792 /// in an empty function call if SVInnerCoverage isn't requested
    793 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
    794 struct UpdateEdgeMasksInnerConservative
    795 {
    796     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
    797                                            const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
    798 };
    799 
    800 //////////////////////////////////////////////////////////////////////////
    801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
    802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
    803 /// evaluated at raster tile corners to inner conservative position and
    804 /// updates edge masks
    805 template <typename RT>
    806 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
    807 {
    808     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
    809                                            const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
    810     {
    811         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
    812 
    813         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
    814         // conservative evaluated edge when adjusting the edge in for inner conservative tests
    815         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
    816         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
    817         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
    818 
    819         UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
    820     }
    821 };
    822 
    823 //////////////////////////////////////////////////////////////////////////
    824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
    825 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
    826 /// cover an entire raster tile, set mask0 to 0 to force it down the
    827 /// rastierizePartialTile path
    828 template <typename RT, typename ValidEdgeMaskT>
    829 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
    830 {
    831     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
    832                                    const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
    833     {
    834         // set one mask to zero to force the triangle down the rastierizePartialTile path
    835         mask0 = 0;
    836     }
    837 };
    838 
    839 template <typename RT>
    840 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
    841 {
    842     SWR_CONTEXT *pContext = pDC->pContext;
    843     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
    844 #if KNOB_ENABLE_TOSS_POINTS
    845     if (KNOB_TOSS_BIN_TRIS)
    846     {
    847         return;
    848     }
    849 #endif
    850     AR_BEGIN(BERasterizeTriangle, pDC->drawId);
    851     AR_BEGIN(BETriangleSetup, pDC->drawId);
    852 
    853     const API_STATE &state = GetApiState(pDC);
    854     const SWR_RASTSTATE &rastState = state.rastState;
    855     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
    856 
    857     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    858     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
    859 
    860     __m128 vX, vY, vZ, vRecipW;
    861 
    862     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    863     // eg: vX = [x0 x1 x2 dc]
    864     vX = _mm_load_ps(workDesc.pTriBuffer);
    865     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    866     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    867     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
    868 
    869     // convert to fixed point
    870     static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
    871     __m128i vXi = fpToFixedPoint(vX);
    872     __m128i vYi = fpToFixedPoint(vY);
    873 
    874     // quantize floating point position to fixed point precision
    875     // to prevent attribute creep around the triangle vertices
    876     vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    877     vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    878 
    879     // triangle setup - A and B edge equation coefs
    880     __m128 vA, vB;
    881     triangleSetupAB(vX, vY, vA, vB);
    882 
    883     __m128i vAi, vBi;
    884     triangleSetupABInt(vXi, vYi, vAi, vBi);
    885 
    886     // determinant
    887     float det = calcDeterminantInt(vAi, vBi);
    888 
    889     // Verts in Pixel Coordinate Space at this point
    890     // Det > 0 = CW winding order
    891     // Convert CW triangles to CCW
    892     if (det > 0.0)
    893     {
    894         vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
    895         vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
    896         vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
    897         vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
    898         det = -det;
    899     }
    900 
    901     __m128 vC;
    902     // Finish triangle setup - C edge coef
    903     triangleSetupC(vX, vY, vA, vB, vC);
    904 
    905     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    906     {
    907         // If we have degenerate edge(s) to rasterize, set I and J coefs
    908         // to 0 for constant interpolation of attributes
    909         triDesc.I[0] = 0.0f;
    910         triDesc.I[1] = 0.0f;
    911         triDesc.I[2] = 0.0f;
    912         triDesc.J[0] = 0.0f;
    913         triDesc.J[1] = 0.0f;
    914         triDesc.J[2] = 0.0f;
    915 
    916         // Degenerate triangles have no area
    917         triDesc.recipDet = 0.0f;
    918     }
    919     else
    920     {
    921         // only extract coefs for 2 of the barycentrics; the 3rd can be
    922         // determined from the barycentric equation:
    923         // i + j + k = 1 <=> k = 1 - j - i
    924         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
    925         _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
    926         _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
    927         _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
    928         _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
    929         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
    930 
    931         // compute recipDet, used to calculate barycentric i and j in the backend
    932         triDesc.recipDet = 1.0f/det;
    933     }
    934 
    935     OSALIGNSIMD(float) oneOverW[4];
    936     _mm_store_ps(oneOverW, vRecipW);
    937     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    938     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    939     triDesc.OneOverW[2] = oneOverW[2];
    940 
    941     // calculate perspective correct coefs per vertex attrib
    942     float* pPerspAttribs = perspAttribsTLS;
    943     float* pAttribs = workDesc.pAttribs;
    944     triDesc.pPerspAttribs = pPerspAttribs;
    945     triDesc.pAttribs = pAttribs;
    946     float *pRecipW = workDesc.pTriBuffer + 12;
    947     triDesc.pRecipW = pRecipW;
    948     __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    949     __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
    950     __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
    951     for(uint32_t i = 0; i < workDesc.numAttribs; i++)
    952     {
    953         __m128 attribA = _mm_load_ps(pAttribs);
    954         __m128 attribB = _mm_load_ps(pAttribs+=4);
    955         __m128 attribC = _mm_load_ps(pAttribs+=4);
    956         pAttribs+=4;
    957 
    958         attribA = _mm_mul_ps(attribA, vOneOverWV0);
    959         attribB = _mm_mul_ps(attribB, vOneOverWV1);
    960         attribC = _mm_mul_ps(attribC, vOneOverWV2);
    961 
    962         _mm_store_ps(pPerspAttribs, attribA);
    963         _mm_store_ps(pPerspAttribs+=4, attribB);
    964         _mm_store_ps(pPerspAttribs+=4, attribC);
    965         pPerspAttribs+=4;
    966     }
    967 
    968     // compute bary Z
    969     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    970     OSALIGNSIMD(float) a[4];
    971     _mm_store_ps(a, vZ);
    972     triDesc.Z[0] = a[0] - a[2];
    973     triDesc.Z[1] = a[1] - a[2];
    974     triDesc.Z[2] = a[2];
    975 
    976     // add depth bias
    977     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
    978 
    979     // Calc bounding box of triangle
    980     OSALIGNSIMD(SWR_RECT) bbox;
    981     calcBoundingBoxInt(vXi, vYi, bbox);
    982 
    983     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
    984 
    985     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    986     {
    987         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
    988         bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
    989         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
    990                    "Conservative rast degenerate handling requires a valid scissor rect");
    991     }
    992 
    993     // Intersect with scissor/viewport
    994     OSALIGNSIMD(SWR_RECT) intersect;
    995     intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    996     intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    997     intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    998     intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
    999 
   1000     triDesc.triFlags = workDesc.triFlags;
   1001 
   1002     // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
   1003     uint32_t macroX, macroY;
   1004     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
   1005     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
   1006     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
   1007     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
   1008     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
   1009 
   1010     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
   1011     intersect.ymin = std::max(intersect.ymin, macroBoxTop);
   1012     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
   1013     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
   1014 
   1015     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
   1016 
   1017     AR_END(BETriangleSetup, 0);
   1018 
   1019     // update triangle desc
   1020     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
   1021     uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
   1022     uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
   1023     uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
   1024     uint32_t numTilesX = maxTileX - minTileX + 1;
   1025     uint32_t numTilesY = maxTileY - minTileY + 1;
   1026 
   1027     if (numTilesX == 0 || numTilesY == 0)
   1028     {
   1029         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
   1030         AR_END(BERasterizeTriangle, 1);
   1031         return;
   1032     }
   1033 
   1034     AR_BEGIN(BEStepSetup, pDC->drawId);
   1035 
   1036     // Step to pixel center of top-left pixel of the triangle bbox
   1037     // Align intersect bbox (top/left) to raster tile's (top/left).
   1038     int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
   1039     int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
   1040 
   1041     // convenience typedef
   1042     typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
   1043 
   1044     // single sample rasterization evaluates edges at pixel center,
   1045     // multisample evaluates edges UL pixel corner and steps to each sample position
   1046     if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
   1047     {
   1048         // Add 0.5, in fixed point, to offset to pixel center
   1049         x += (FIXED_POINT_SCALE / 2);
   1050         y += (FIXED_POINT_SCALE / 2);
   1051     }
   1052 
   1053     __m128i vTopLeftX = _mm_set1_epi32(x);
   1054     __m128i vTopLeftY = _mm_set1_epi32(y);
   1055 
   1056     // evaluate edge equations at top-left pixel using 64bit math
   1057     //
   1058     // line = Ax + By + C
   1059     // solving for C:
   1060     // C = -Ax - By
   1061     // we know x0 and y0 are on the line; plug them in:
   1062     // C = -Ax0 - By0
   1063     // plug C back into line equation:
   1064     // line = Ax - By - Ax0 - By0
   1065     // line = A(x - x0) + B(y - y0)
   1066     // dX = (x-x0), dY = (y-y0)
   1067     // so all this simplifies to
   1068     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
   1069 
   1070     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
   1071     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
   1072 
   1073     // evaluate A(dx) and B(dY) for all points
   1074     __m256d vAipd = _mm256_cvtepi32_pd(vAi);
   1075     __m256d vBipd = _mm256_cvtepi32_pd(vBi);
   1076     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
   1077     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
   1078 
   1079     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
   1080     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
   1081     __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
   1082 
   1083     // apply any edge adjustments(top-left, crast, etc)
   1084     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
   1085 
   1086     // broadcast respective edge results to all lanes
   1087     double* pEdge = (double*)&vEdge;
   1088     __m256d vEdgeFix16[7];
   1089     vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
   1090     vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
   1091     vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
   1092 
   1093     OSALIGNSIMD(int32_t) aAi[4], aBi[4];
   1094     _mm_store_si128((__m128i*)aAi, vAi);
   1095     _mm_store_si128((__m128i*)aBi, vBi);
   1096     EDGE rastEdges[RT::NumEdgesT::value];
   1097 
   1098     // Compute and store triangle edge data
   1099     ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
   1100     ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
   1101     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
   1102 
   1103     // Compute and store triangle edge data if scissor needs to rasterized
   1104     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
   1105                        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
   1106 
   1107     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
   1108     // used to for testing if entire raster tile is inside a triangle
   1109     for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1110     {
   1111         vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
   1112     }
   1113 
   1114     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
   1115     // step sample positions to the raster tile bbox of multisample points
   1116     // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
   1117     //                             |      |
   1118     //                             |      |
   1119     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
   1120     __m256d vEdgeTileBbox[3];
   1121     if (NumCoverageSamplesT::value > 1)
   1122     {
   1123         const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
   1124         const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
   1125         const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
   1126 
   1127         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
   1128         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
   1129 
   1130         // step edge equation tests from Tile
   1131         // used to for testing if entire raster tile is inside a triangle
   1132         for (uint32_t e = 0; e < 3; ++e)
   1133         {
   1134             __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
   1135             __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
   1136             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
   1137 
   1138             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
   1139             adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
   1140         }
   1141     }
   1142 
   1143     AR_END(BEStepSetup, 0);
   1144 
   1145     uint32_t tY = minTileY;
   1146     uint32_t tX = minTileX;
   1147     uint32_t maxY = maxTileY;
   1148     uint32_t maxX = maxTileX;
   1149 
   1150     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
   1151     GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
   1152     currentRenderBufferRow = renderBuffers;
   1153 
   1154     // rasterize and generate coverage masks per sample
   1155     for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
   1156     {
   1157         __m256d vStartOfRowEdge[RT::NumEdgesT::value];
   1158         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1159         {
   1160             vStartOfRowEdge[e] = vEdgeFix16[e];
   1161         }
   1162 
   1163         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
   1164         {
   1165             triDesc.anyCoveredSamples = 0;
   1166 
   1167             // is the corner of the edge outside of the raster tile? (vEdge < 0)
   1168             int mask0, mask1, mask2;
   1169             UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
   1170 
   1171             for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
   1172             {
   1173                 // trivial reject, at least one edge has all 4 corners of raster tile outside
   1174                 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
   1175 
   1176                 if (!trivialReject)
   1177                 {
   1178                     // trivial accept mask
   1179                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
   1180 
   1181                     // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
   1182                     UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
   1183                         (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
   1184 
   1185                     // @todo Make this a bit smarter to allow use of trivial accept when:
   1186                     //   1) scissor/vp intersection rect is raster tile aligned
   1187                     //   2) raster tile is entirely within scissor/vp intersection rect
   1188                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
   1189                     {
   1190                         // trivial accept, all 4 corners of all 3 edges are negative
   1191                         // i.e. raster tile completely inside triangle
   1192                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
   1193                         if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
   1194                         {
   1195                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
   1196                         }
   1197                         RDTSC_EVENT(BETrivialAccept, 1, 0);
   1198                     }
   1199                     else
   1200                     {
   1201                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
   1202                         if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
   1203                         {
   1204                             // should get optimized out for single sample case (global value numbering or copy propagation)
   1205                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1206                             {
   1207                                 vEdgeAtSample[e] = vEdgeFix16[e];
   1208                             }
   1209                         }
   1210                         else
   1211                         {
   1212                             const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
   1213                             __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
   1214                             __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
   1215                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
   1216                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
   1217 
   1218                             // step edge equation tests from UL tile corner to pixel sample position
   1219                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1220                             {
   1221                                 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
   1222                                 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
   1223                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
   1224                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
   1225                             }
   1226                         }
   1227 
   1228                         double startQuadEdges[RT::NumEdgesT::value];
   1229                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
   1230                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1231                         {
   1232                             _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
   1233                         }
   1234 
   1235                         // not trivial accept or reject, must rasterize full tile
   1236                         AR_BEGIN(BERasterizePartial, pDC->drawId);
   1237                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
   1238                         AR_END(BERasterizePartial, 0);
   1239 
   1240                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
   1241 
   1242                         // Output SV InnerCoverage, if needed
   1243                         GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
   1244                     }
   1245                 }
   1246                 else
   1247                 {
   1248                     // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
   1249                     if(NumCoverageSamplesT::value > 1)
   1250                     {
   1251                         triDesc.coverageMask[sampleNum] = 0;
   1252                     }
   1253                     RDTSC_EVENT(BETrivialReject, 1, 0);
   1254                 }
   1255             }
   1256 
   1257 #if KNOB_ENABLE_TOSS_POINTS
   1258             if(KNOB_TOSS_RS)
   1259             {
   1260                 gToss = triDesc.coverageMask[0];
   1261             }
   1262             else
   1263 #endif
   1264             if(triDesc.anyCoveredSamples)
   1265             {
   1266                 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
   1267                 // copy conservative coverage result to all samples
   1268                 if(RT::IsConservativeT::value)
   1269                 {
   1270                     auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
   1271                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
   1272                 }
   1273 
   1274                 AR_BEGIN(BEPixelBackend, pDC->drawId);
   1275                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
   1276                 AR_END(BEPixelBackend, 0);
   1277             }
   1278 
   1279             // step to the next tile in X
   1280             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1281             {
   1282                 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
   1283             }
   1284             StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
   1285         }
   1286 
   1287         // step to the next tile in Y
   1288         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1289         {
   1290             vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
   1291         }
   1292         StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
   1293     }
   1294 
   1295     AR_END(BERasterizeTriangle, 1);
   1296 }
   1297 
   1298 // Get pointers to hot tile memory for color RT, depth, stencil
   1299 template <uint32_t numSamples>
   1300 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
   1301 {
   1302     const API_STATE& state = GetApiState(pDC);
   1303     SWR_CONTEXT *pContext = pDC->pContext;
   1304 
   1305     uint32_t mx, my;
   1306     MacroTileMgr::getTileIndices(macroID, mx, my);
   1307     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
   1308     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
   1309 
   1310     // compute tile offset for active hottile buffers
   1311     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
   1312     uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1313     offset*=numSamples;
   1314 
   1315     unsigned long rtSlot = 0;
   1316     uint32_t colorHottileEnableMask = state.colorHottileEnable;
   1317     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
   1318     {
   1319         HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
   1320             numSamples, renderTargetArrayIndex);
   1321         pColor->state = HOTTILE_DIRTY;
   1322         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
   1323 
   1324         colorHottileEnableMask &= ~(1 << rtSlot);
   1325     }
   1326     if(state.depthHottileEnable)
   1327     {
   1328         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
   1329         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1330         offset*=numSamples;
   1331         HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
   1332             numSamples, renderTargetArrayIndex);
   1333         pDepth->state = HOTTILE_DIRTY;
   1334         SWR_ASSERT(pDepth->pBuffer != nullptr);
   1335         renderBuffers.pDepth = pDepth->pBuffer + offset;
   1336     }
   1337     if(state.stencilHottileEnable)
   1338     {
   1339         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
   1340         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1341         offset*=numSamples;
   1342         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
   1343             numSamples, renderTargetArrayIndex);
   1344         pStencil->state = HOTTILE_DIRTY;
   1345         SWR_ASSERT(pStencil->pBuffer != nullptr);
   1346         renderBuffers.pStencil = pStencil->pBuffer + offset;
   1347     }
   1348 }
   1349 
   1350 template <typename RT>
   1351 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
   1352 {
   1353     DWORD rt = 0;
   1354     while (_BitScanForward(&rt, colorHotTileMask))
   1355     {
   1356         colorHotTileMask &= ~(1 << rt);
   1357         buffers.pColor[rt] += RT::colorRasterTileStep;
   1358     }
   1359 
   1360     buffers.pDepth += RT::depthRasterTileStep;
   1361     buffers.pStencil += RT::stencilRasterTileStep;
   1362 }
   1363 
   1364 template <typename RT>
   1365 INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
   1366 {
   1367     DWORD rt = 0;
   1368     while (_BitScanForward(&rt, colorHotTileMask))
   1369     {
   1370         colorHotTileMask &= ~(1 << rt);
   1371         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
   1372         buffers.pColor[rt] = startBufferRow.pColor[rt];
   1373     }
   1374     startBufferRow.pDepth += RT::depthRasterTileRowStep;
   1375     buffers.pDepth = startBufferRow.pDepth;
   1376 
   1377     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
   1378     buffers.pStencil = startBufferRow.pStencil;
   1379 }
   1380 
   1381