Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     21 * IN THE SOFTWARE.
     22 *
     23 * @file rasterizer.cpp
     24 *
     25 * @brief Implementation for the rasterizer.
     26 *
     27 ******************************************************************************/
     29 #include <vector>
     30 #include <algorithm>
        #include <cstring>
     32 #include "rasterizer.h"
     33 #include "rdtsc_core.h"
     34 #include "backend.h"
     35 #include "utils.h"
     36 #include "frontend.h"
     37 #include "tilemgr.h"
     38 #include "memory/tilingtraits.h"
     // Forward declarations of function templates; no definition of any of them is visible in this
     // part of the file.  NOTE(review): presumably defined further down - confirm.
     40 template <uint32_t numSamples = 1>
     41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
        // StepRasterTileX / StepRasterTileY are parameterized on a type RT and receive the
        // RenderOutputBuffers by non-const reference.  NOTE(review): the names suggest they advance
        // the buffers by one raster tile in X / Y - confirm against the definitions.
     42 template <typename RT>
     43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
     44 template <typename RT>
     45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
     // MASKTOVEC takes its arguments from bit 3 down to bit 0 and expands to a 4-element
     // initializer listed in the opposite order (i0 first).  Each argument is negated, so a
     // set bit (1) becomes -1 and a clear bit (0) stays 0.
     47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
        // Lookup table with 16 entries, one per 4-bit mask value: the arguments of entry n are the
        // binary digits of n.  adjustTopLeftRuleIntFix16 indexes this table with a 4-bit mask and
        // passes the entry to _mm256_blendv_pd as the selector operand.
     48 const __m256d gMaskToVecpd[] =
     49 {
     50     MASKTOVEC(0, 0, 0, 0),
     51     MASKTOVEC(0, 0, 0, 1),
     52     MASKTOVEC(0, 0, 1, 0),
     53     MASKTOVEC(0, 0, 1, 1),
     54     MASKTOVEC(0, 1, 0, 0),
     55     MASKTOVEC(0, 1, 0, 1),
     56     MASKTOVEC(0, 1, 1, 0),
     57     MASKTOVEC(0, 1, 1, 1),
     58     MASKTOVEC(1, 0, 0, 0),
     59     MASKTOVEC(1, 0, 0, 1),
     60     MASKTOVEC(1, 0, 1, 0),
     61     MASKTOVEC(1, 0, 1, 1),
     62     MASKTOVEC(1, 1, 0, 0),
     63     MASKTOVEC(1, 1, 0, 1),
     64     MASKTOVEC(1, 1, 1, 0),
     65     MASKTOVEC(1, 1, 1, 1),
     66 };
     /// @brief Integer 2D position made of two signed 32-bit coordinates.
     struct POS
     {
         int32_t x;  ///< horizontal coordinate
         int32_t y;  ///< vertical coordinate
     };
     // Per-edge data: the edge coefficients plus the precomputed steps and offsets derived from
     // them.  All members are filled in by ComputeEdgeData.
     73 struct EDGE
     74 {
     75     double a, b;                // a, b edge coefficients in fix8 (assigned from the int32_t inputs of ComputeEdgeData)
     76     double stepQuadX;           // step to adjacent horizontal quad in fix16: a * 2 * FIXED_POINT_SCALE
     77     double stepQuadY;           // step to adjacent vertical quad in fix16: b * 2 * FIXED_POINT_SCALE
     78     double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16: a * KNOB_TILE_X_DIM * FIXED_POINT_SCALE
     79     double stepRasterTileY;     // step to adjacent vertical raster tile in fix16: b * KNOB_TILE_Y_DIM * FIXED_POINT_SCALE
     81     __m256d vQuadOffsets;       // offsets for 4 samples of a quad (x offset 0 or 1, y offset 0 or 1, scaled by FIXED_POINT_SCALE)
     82     __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile (x/y offset 0 or tile dimension - 1, scaled by FIXED_POINT_SCALE)
     83 };
     85 //////////////////////////////////////////////////////////////////////////
     86 /// @brief rasterize a raster tile partially covered by the triangle
     87 /// @tparam NumEdges - number of edge equations held in startEdges / pRastEdges
     88 /// @tparam EdgeMaskT - type whose ::value is forwarded to UnrollerLMask in the 8x8 path; for
     89 ///         E1E2ValidT and NoEdgesValidT the per-quad mask starts at 0xf instead of edge 0's mask
     90 /// @param pDC - draw context (not referenced by the code of this function)
        /// @param startEdges - one starting value per edge equation (Ax + By + C); each value is
        ///        broadcast and offset by pRastEdges[e].vQuadOffsets to get the 4 samples of the 1st quad
        /// @param pRastEdges - per-edge data; vQuadOffsets, stepQuadX and stepQuadY are read here
        /// @return 64-bit coverage mask with 4 bits per 2x2 quad, taken from the sign bits of the edge
        ///         values; in the 8x8 path quad (qx, qy) occupies the bits starting at 16 * qy + 4 * qx
     91 template<uint32_t NumEdges, typename EdgeMaskT>
     92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
     93 {
     94     uint64_t coverageMask = 0;
     96     __m256d vEdges[NumEdges];
     97     __m256d vStepX[NumEdges];
     98     __m256d vStepY[NumEdges];
    100     for (uint32_t e = 0; e < NumEdges; ++e)
    101     {
    102         // Step to the pixel sample locations of the 1st quad
    103         vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
    105         // broadcast the precomputed per-edge steps to the next quad in x and in y
    106         vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
    107         vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    108     }
    110     // fast unrolled version for 8x8 tile
    111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    112     int edgeMask[NumEdges];
    113     uint64_t mask;
            // Per-edge operations handed to UnrollerLMask::step: read the sign bits of an edge,
            // AND an edge's sign bits into the running mask, and move an edge by +x, +y or -x.
    115     auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
    116     auto update_lambda = [&](int e){mask &= edgeMask[e];};
    117     auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
    118     auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
    119     auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
    121 // evaluate which pixels in the quad are covered
    122 #define EVAL \
    123             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
    125     // update coverage mask
    126     // if edge 0 is degenerate and will be skipped; init the mask
            // (the quad's 4-bit mask is then shifted to position 'bit' and OR-ed into coverageMask)
    127 #define UPDATE_MASK(bit) \
    128             if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
    129                 mask = 0xf;\
    130             }\
    131             else{\
    132                 mask = edgeMask[0]; \
    133             }\
    134             UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
    135             coverageMask |= (mask << bit);
    137     // step in the +x direction to the next quad
    138 #define INCX \
    139             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
    141     // step in the +y direction to the next quad
    142 #define INCY \
    143             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
    145     // step in the -x direction to the next quad
    146 #define DECX \
    147             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
    149     // sweep 2x2 quad back and forth through the raster tile,
    150     // computing coverage masks for the entire tile
    152     // raster tile
    153     // 0  1  2  3  4  5  6  7
    154     // x  x
    155     // x  x ------------------>
    156     //                   x  x  |
    157     // <-----------------x  x  V
    158     // ..
    160     // row 0 (left to right: bits 0, 4, 8, 12)
    161     EVAL;
    162     UPDATE_MASK(0);
    163     INCX;
    164     EVAL;
    165     UPDATE_MASK(4);
    166     INCX;
    167     EVAL;
    168     UPDATE_MASK(8);
    169     INCX;
    170     EVAL;
    171     UPDATE_MASK(12);
    172     INCY;
    174     //row 1 (right to left, so the bit positions run 28, 24, 20, 16)
    175     EVAL;
    176     UPDATE_MASK(28);
    177     DECX;
    178     EVAL;
    179     UPDATE_MASK(24);
    180     DECX;
    181     EVAL;
    182     UPDATE_MASK(20);
    183     DECX;
    184     EVAL;
    185     UPDATE_MASK(16);
    186     INCY;
    188     // row 2 (left to right: bits 32, 36, 40, 44)
    189     EVAL;
    190     UPDATE_MASK(32);
    191     INCX;
    192     EVAL;
    193     UPDATE_MASK(36);
    194     INCX;
    195     EVAL;
    196     UPDATE_MASK(40);
    197     INCX;
    198     EVAL;
    199     UPDATE_MASK(44);
    200     INCY;
    202     // row 3 (right to left: bits 60, 56, 52, 48); last row, so no further step is needed
    203     EVAL;
    204     UPDATE_MASK(60);
    205     DECX;
    206     EVAL;
    207     UPDATE_MASK(56);
    208     DECX;
    209     EVAL;
    210     UPDATE_MASK(52);
    211     DECX;
    212     EVAL;
    213     UPDATE_MASK(48);
    214 #else
            // Generic path for other tile sizes: rows are always walked left to right, restarting
            // each row from the saved start-of-row edge values, with 'bit' advancing by 4 per quad.
            // NOTE(review): this path tests every edge (starting from edgeMask[0]) and never consults
            // EdgeMaskT, unlike the 8x8 path above - confirm that this difference is intended.
    215     uint32_t bit = 0;
    216     for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
    217     {
    218         __m256d vStartOfRowEdge[NumEdges];
    219         for (uint32_t e = 0; e < NumEdges; ++e)
    220         {
    221             vStartOfRowEdge[e] = vEdges[e];
    222         }
    224         for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
    225         {
    226             int edgeMask[NumEdges];
    227             for (uint32_t e = 0; e < NumEdges; ++e)
    228             {
    229                 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
    230             }
    232             uint64_t mask = edgeMask[0];
    233             for (uint32_t e = 1; e < NumEdges; ++e)
    234             {
    235                 mask &= edgeMask[e];
    236             }
    237             coverageMask |= (mask << bit);
    239             // step to the next quad in the x
    240             for (uint32_t e = 0; e < NumEdges; ++e)
    241             {
    242                 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
    243             }
    244             bit+=4;
    245         }
    247         // step to the next row
    248         for (uint32_t e = 0; e < NumEdges; ++e)
    249         {
    250             vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
    251         }
    252     }
    253 #endif
    254     return coverageMask;
    256 }
    257 // Top left rule:
    258 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
    259 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
    260 // Top left: a sample is in if it is a top or left edge.
    261 // Out: !(horizontal && above) = !horizontal && below
    262 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
    263 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
    264 {
    265     // if vA < 0, vC--
    266     // if vA == 0 && vB < 0, vC--
    268     __m256d vEdgeOut = vEdge;
    269     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
    271     // if vA < 0 (line is not horizontal and below)
    272     int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
    274     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
    275     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
    276     int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
    277     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
    279     // if either of these are true and we're on the line (edge == 0), bump it outside the line
    280     vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
    281 }
    //////////////////////////////////////////////////////////////////////////
    /// @brief Compile-time number of fractional bits by which the result of the
    /// manh calculation exceeds the edge precision:
    /// (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits.
    /// Fails to compile when that difference would be negative.
    /// @tparam RT: traits type providing PrecisionT, ConservativePrecisionT and
    /// EdgePrecisionT, each exposing BitsT::value
    template<typename RT>
    constexpr int64_t ManhToEdgePrecisionAdjust()
    {
        using PrecisionBitsT    = typename RT::PrecisionT::BitsT;
        using ConservativeBitsT = typename RT::ConservativePrecisionT::BitsT;
        using EdgeBitsT         = typename RT::EdgePrecisionT::BitsT;

        static_assert(PrecisionBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                      "Inadequate precision of result of manh calculation ");
        return ((PrecisionBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
    }
    294 //////////////////////////////////////////////////////////////////////////
    295 /// @struct adjustEdgeConservative
    296 /// @brief Primary template definition used for partially specializing
    297 /// the adjustEdgeConservative function. This struct should never
    298 /// be instantiated.
    299 /// @tparam RT: rasterizer traits
    300 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
    301 template <typename RT, typename ConservativeEdgeOffsetT>
    302 struct adjustEdgeConservative
    303 {
    304     //////////////////////////////////////////////////////////////////////////
    305     /// @brief Performs calculations to adjust each edge of a triangle away
    306     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    307     /// direction.
    308     ///
    309     /// Uncertainty regions arise from fixed point rounding, which
    310     /// can snap a vertex +/- by min fixed point value.
    311     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
    312     /// This allows the rasterizer to test for coverage only at the pixel center,
    313     /// instead of having to test individual pixel corners for conservative coverage
    314     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    315     {
    316         // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
    317         // from the pixel center (in the direction of the edge normal A/B)
    319         // edge = Ax + Bx + C - (manh/e)
    320         // manh = manhattan distance = abs(A) + abs(B)
    321         // e = absolute rounding error from snapping from float to fixed point precision
    323         // 'fixed point' multiply (in double to be avx1 friendly)
    324         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
    325         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
    326         __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
    327                                      _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
    329         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
    330                       "Inadequate precision of result of manh calculation ");
    332         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
    333         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
    334         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
    336         // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
    337         // this allows the rasterizer to do a single conservative coverage test to see if the primitive
    338         // intersects the pixel at all
    339         vEdge = _mm256_sub_pd(vEdge, manh);
    340     };
    341 };
    343 //////////////////////////////////////////////////////////////////////////
    344 /// @brief adjustEdgeConservative specialization where no edge offset is needed
    345 template <typename RT>
    346 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
    347 {
    348     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
    349 };
    351 //////////////////////////////////////////////////////////////////////////
    352 /// @brief calculates the distance a degenerate BBox needs to be adjusted
    353 /// for conservative rast based on compile time trait values
    354 template<typename RT>
    355 constexpr int64_t ConservativeScissorOffset()
    356 {
    357     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
    358     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
    359     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
    360     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
    361     return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
    362 }
    364 //////////////////////////////////////////////////////////////////////////
    365 /// @brief Performs calculations to adjust each a vector of evaluated edges out
    366 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    367 /// direction.
    368 template <typename RT>
    369 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
    370 {
    371     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
    372     int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
    373     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
    374 };
    376 //////////////////////////////////////////////////////////////////////////
    377 /// @brief Performs calculations to adjust each a scalar evaluated edge out
    378 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    379 /// direction.
    380 template <typename RT, typename OffsetT>
    381 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
    382 {
    383     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
    384     int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
    385     return (Edge - manh);
    386 };
    388 //////////////////////////////////////////////////////////////////////////
    389 /// @brief Perform any needed adjustments to evaluated triangle edges
    390 template <typename RT, typename EdgeOffsetT>
    391 struct adjustEdgesFix16
    392 {
    393     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    394     {
    395         static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
    396                       "Edge equation expected to be in x.16 fixed point");
    398         static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
    400         // need to apply any edge offsets before applying the top-left rule
    401         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
    403         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    404     }
    405 };
    407 //////////////////////////////////////////////////////////////////////////
    408 /// @brief Perform top left adjustments to evaluated triangle edges
    409 template <typename RT>
    410 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
    411 {
    412     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
    413     {
    414         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    415     }
    416 };
    418 // max(abs(dz/dx), abs(dz,dy)
    419 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
    420 {
    421     /*
    422     // evaluate i,j at (0,0)
    423     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    424     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
    426     // evaluate i,j at (1,0)
    427     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    428     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
    430     // compute dz/dx
    431     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
    432     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
    433     float dzdx = abs(d10 - d00);
    435     // evaluate i,j at (0,1)
    436     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
    437     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
    439     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
    440     float dzdy = abs(d01 - d00);
    441     */
    443     // optimized version of above
    444     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
    445     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
    447     return std::max(dzdx, dzdy);
    448 }
    450 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
    451 {
    452     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
    453     {
    454         return (1.0f / (1 << 24));
    455     }
    456     else if (pState->depthFormat == R16_UNORM)
    457     {
    458         return (1.0f / (1 << 16));
    459     }
    460     else
    461     {
    462         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
    464         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
    465         float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
    466         uint32_t zMaxInt = *(uint32_t*)&zMax;
    467         zMaxInt &= 0x7f800000;
    468         zMax = *(float*)&zMaxInt;
    470         return zMax * (1.0f / (1 << 23));
    471     }
    472 }
    474 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
    475 {
    476     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
    477     {
    478         return 0.0f;
    479     }
    481     float scale = pState->slopeScaledDepthBias;
    482     if (scale != 0.0f)
    483     {
    484         scale *= ComputeMaxDepthSlope(pTri);
    485     }
    487     float bias = pState->depthBias;
    488     if (!pState->depthBiasPreAdjusted)
    489     {
    490         bias *= ComputeBiasFactor(pState, pTri, z);
    491     }
    492     bias += scale;
    494     if (pState->depthBiasClamp > 0.0f)
    495     {
    496         bias = std::min(bias, pState->depthBiasClamp);
    497     }
    498     else if (pState->depthBiasClamp < 0.0f)
    499     {
    500         bias = std::max(bias, pState->depthBiasClamp);
    501     }
    503     return bias;
    504 }
    506 // Prevent DCE by writing coverage mask from rasterizer to volatile
        // NOTE(review): the #endif below has no matching #if / #ifdef in the visible text (the
        // embedded line numbering skips 507), so the opening guard of this section appears to be
        // missing here - confirm against the upstream file.
    508 __declspec(thread) volatile uint64_t gToss;
    509 #endif
        // Sizing constants for the attribute scratch buffer declared below.
    511 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
    512 // try to avoid _chkstk insertions; make this thread local
        // Holds vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib floats.
    513 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
    515 INLINE
    516 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
    517 {
    518     edge.a = a;
    519     edge.b = b;
    521     // compute constant steps to adjacent quads
    522     edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
    523     edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
    525     // compute constant steps to adjacent raster tiles
    526     edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
    527     edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
    529     // compute quad offsets
    530     const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
    531     const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
    533     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
    534     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
    535     edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
    537     // compute raster tile offsets
    538     const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
    539     const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
    541     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
    542     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
    543     edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
    544 }
    546 INLINE
    547 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
    548 {
    549     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
    550 }
    552 //////////////////////////////////////////////////////////////////////////
    553 /// @brief Primary template definition used for partially specializing
    554 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
    555 /// corner to sample position, and test for coverage
    556 /// @tparam sampleCount: multisample count
    557 template <typename NumSamplesT>
    558 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
    559                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
    560 {
    561     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
    562     // evaluate edge equations at the tile multisample bounding box
    563     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
    564     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
    565     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
    566     mask0 = _mm256_movemask_pd(vSampleBboxTest0);
    567     mask1 = _mm256_movemask_pd(vSampleBboxTest1);
    568     mask2 = _mm256_movemask_pd(vSampleBboxTest2);
    569 }
    571 //////////////////////////////////////////////////////////////////////////
    572 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
    573 /// when only rasterizing a single coverage test point
    574 template <>
    575 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
    576                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
    577 {
    578     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
    579     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
    580     mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
    581 }
    583 //////////////////////////////////////////////////////////////////////////
    584 /// @struct ComputeScissorEdges
    585 /// @brief Primary template definition. Allows the function to be generically
    586 /// called. When paired with below specializations, will result in an empty
    587 /// inlined function if scissor is not enabled
    588 /// @tparam RasterScissorEdgesT: is scissor enabled?
    589 /// @tparam IsConservativeT: is conservative rast enabled?
    590 /// @tparam RT: rasterizer traits
    591 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
    592 struct ComputeScissorEdges
    593 {
    594     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    595                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
    596 };
    598 //////////////////////////////////////////////////////////////////////////
    599 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
    600 /// specialization. Instantiated when conservative rast and scissor are enabled
    601 template <typename RT>
    602 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
    603 {
    604     //////////////////////////////////////////////////////////////////////////
    605     /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
    606     /// evaluate edge equations and offset them away from pixel center.
    607     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    608                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
    609     {
    610         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
    611         SWR_RECT scissor;
    612         scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
    613         scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
    614         scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
    615         scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
    617         POS topLeft{scissor.xmin, scissor.ymin};
    618         POS bottomLeft{scissor.xmin, scissor.ymax};
    619         POS topRight{scissor.xmax, scissor.ymin};
    620         POS bottomRight{scissor.xmax, scissor.ymax};
    622         // construct 4 scissor edges in ccw direction
    623         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
    624         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
    625         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
    626         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
    628         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
    629         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
    630         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
    631         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
    633         // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
    634         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
    635         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
    636         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
    637         adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
    639         // Upper left rule for scissor
    640         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
    641         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    642     }
    643 };
    645 //////////////////////////////////////////////////////////////////////////
    646 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
    647 /// specialization. Instantiated when scissor is enabled and conservative rast
    648 /// is disabled.
    649 template <typename RT>
    650 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
    651 {
    652     //////////////////////////////////////////////////////////////////////////
    653     /// @brief Compute scissor edge vectors and evaluate edge equations
    654     INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
    655                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
    656     {
    657         const SWR_RECT &scissor = scissorBBox;
    658         POS topLeft{scissor.xmin, scissor.ymin};
    659         POS bottomLeft{scissor.xmin, scissor.ymax};
    660         POS topRight{scissor.xmax, scissor.ymin};
    661         POS bottomRight{scissor.xmax, scissor.ymax};
    663         // construct 4 scissor edges in ccw direction
    664         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
    665         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
    666         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
    667         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
    669         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
    670         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
    671         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
    672         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
    674         // Upper left rule for scissor
    675         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
    676         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    677     }
    678 };
    680 //////////////////////////////////////////////////////////////////////////
    681 /// @brief Primary function template for TrivialRejectTest. Should
    682 /// never be called, but TemplateUnroller instantiates a few unused values,
    683 /// so it calls a runtime assert instead of a static_assert.
    684 template <typename ValidEdgeMaskT>
    685 INLINE bool TrivialRejectTest(const int, const int, const int)
    686 {
    687     SWR_ASSERT(0, "Primary templated function should never be called");
    688     return false;
    689 };
    691 //////////////////////////////////////////////////////////////////////////
    692 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
    693 /// and edge 1 for trivial coverage reject
    694 template <>
    695 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
    696 {
    697     return (!(mask0 && mask1)) ? true : false;
    698 };
    700 //////////////////////////////////////////////////////////////////////////
    701 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
    702 /// and edge 2 for trivial coverage reject
    703 template <>
    704 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
    705 {
    706     return (!(mask0 && mask2)) ? true : false;
    707 };
    709 //////////////////////////////////////////////////////////////////////////
    710 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
    711 /// and edge 2 for trivial coverage reject
    712 template <>
    713 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
    714 {
    715     return (!(mask1 && mask2)) ? true : false;
    716 };
    718 //////////////////////////////////////////////////////////////////////////
    719 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
    720 /// primitive edges for trivial coverage reject
    721 template <>
    722 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
    723 {
    724     return (!(mask0 && mask1 && mask2)) ? true : false;;
    725 };
    727 //////////////////////////////////////////////////////////////////////////
    728 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
    729 /// point, so return false and rasterize against conservative BBox
    730 template <>
    731 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
    732 {
    733     return false;
    734 };
    736 //////////////////////////////////////////////////////////////////////////
    737 /// @brief Primary function template for TrivialAcceptTest. Always returns
    738 /// false, since it will only be called for degenerate tris, and as such
    739 /// will never cover the entire raster tile
    740 template <typename ScissorEnableT>
    741 INLINE bool TrivialAcceptTest(const int, const int, const int)
    742 {
    743     return false;
    744 };
    746 //////////////////////////////////////////////////////////////////////////
    747 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
    748 /// edge masks for a fully covered raster tile
    749 template <>
    750 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
    751 {
    752     return ((mask0 & mask1 & mask2) == 0xf);
    753 };
    755 //////////////////////////////////////////////////////////////////////////
    756 /// @brief Primary function template for GenerateSVInnerCoverage. Results
    757 /// in an empty function call if SVInnerCoverage isn't requested
    758 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
    759 struct GenerateSVInnerCoverage
    760 {
    761     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
    762 };
    764 //////////////////////////////////////////////////////////////////////////
    765 /// @brief Specialization of GenerateSVInnerCoverage where all edges
    766 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
    767 /// edge values from OuterConservative to InnerConservative and rasterizes.
    768 template <typename RT>
    769 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
    770 {
    771     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
    772     {
    773         SWR_CONTEXT *pContext = pDC->pContext;
    775         double startQuadEdgesAdj[RT::NumEdgesT::value];
    776         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    777         {
    778             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
    779         }
    781         // not trivial accept or reject, must rasterize full tile
    782         AR_BEGIN(BERasterizePartial, pDC->drawId);
    783         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
    784         AR_END(BERasterizePartial, 0);
    785     }
    786 };
    788 //////////////////////////////////////////////////////////////////////////
    789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
    790 /// in an empty function call if SVInnerCoverage isn't requested
    791 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
    792 struct UpdateEdgeMasksInnerConservative
    793 {
    794     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
    795                                            const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
    796 };
    798 //////////////////////////////////////////////////////////////////////////
    799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
    800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
    801 /// evaluated at raster tile corners to inner conservative position and
    802 /// updates edge masks
    803 template <typename RT>
    804 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
    805 {
    806     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
    807                                            const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
    808     {
    809         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
    811         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
    812         // conservative evaluated edge when adjusting the edge in for inner conservative tests
    813         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
    814         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
    815         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
    817         UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
    818     }
    819 };
    821 //////////////////////////////////////////////////////////////////////////
    822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
    823 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
    824 /// cover an entire raster tile, set mask0 to 0 to force it down the
    825 /// rastierizePartialTile path
    826 template <typename RT, typename ValidEdgeMaskT>
    827 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
    828 {
    829     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
    830                                    const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
    831     {
    832         // set one mask to zero to force the triangle down the rastierizePartialTile path
    833         mask0 = 0;
    834     }
    835 };
    //////////////////////////////////////////////////////////////////////////
    /// @brief Rasterizes one binned triangle inside one macrotile.
    ///
    /// Steps, in order:
    ///   1. load the vertex x/y/z/1/w vectors from the work descriptor and snap
    ///      x/y to 16.8 fixed point
    ///   2. set up the A/B/C edge coefficients, flipping CW triangles to CCW
    ///   3. fill triDesc (I/J coefficients, recipDet, OneOverW, perspective
    ///      attributes, Z incl. depth bias)
    ///   4. intersect the triangle bbox with the scissor rect and the macrotile
    ///   5. evaluate the edge equations at the first raster tile and walk all
    ///      raster tiles of the intersection, doing trivial reject / trivial
    ///      accept / partial rasterization per sample
    ///   6. call the pixel backend for every raster tile with covered samples
    ///
    /// @param pDC - draw context; source of API state and backend functions
    /// @param workerId - id of the calling worker, forwarded to the backend
    /// @param macroTile - macrotile id, decoded via MacroTileMgr::getTileIndices
    /// @param pDesc - work item; reinterpreted as TRIANGLE_WORK_DESC
    /// @tparam RT - rasterizer traits (sample count, valid edge mask, conservative
    ///              rast and input coverage settings, number of edges, ...)
    837 template <typename RT>
    838 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
    839 {
    840     SWR_CONTEXT *pContext = pDC->pContext;
    841     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
            // NOTE(review): the #endif below has no visible matching #if in this
            // listing (a line is missing right here) - restore the opening
            // preprocessor condition from the original file.
    843     if (KNOB_TOSS_BIN_TRIS)
    844     {
    845         return;
    846     }
    847 #endif
    848     AR_BEGIN(BERasterizeTriangle, pDC->drawId);
    849     AR_BEGIN(BETriangleSetup, pDC->drawId);
    851     const API_STATE &state = GetApiState(pDC);
    852     const SWR_RASTSTATE &rastState = state.rastState;
    853     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
            // NOTE(review): triDesc is used from here on but its declaration is not
            // visible in this listing (lines are missing before this point).
    856     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
    858     __m128 vX, vY, vZ, vRecipW;
    860     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    861     // eg: vX = [x0 x1 x2 dc]
    862     vX = _mm_load_ps(workDesc.pTriBuffer);
    863     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    864     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    865     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
    867     // convert to fixed point
    868     static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
    869     __m128i vXi = fpToFixedPoint(vX);
    870     __m128i vYi = fpToFixedPoint(vY);
    872     // quantize floating point position to fixed point precision
    873     // to prevent attribute creep around the triangle vertices
    874     vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    875     vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    877     // triangle setup - A and B edge equation coefs
            // (computed twice: float version for the barycentric coefs, integer
            // version for the edge evaluation further down)
    878     __m128 vA, vB;
    879     triangleSetupAB(vX, vY, vA, vB);
    881     __m128i vAi, vBi;
    882     triangleSetupABInt(vXi, vYi, vAi, vBi);
    884     // determinant
    885     float det = calcDeterminantInt(vAi, vBi);
    887     // Verts in Pixel Coordinate Space at this point
    888     // Det > 0 = CW winding order
    889     // Convert CW triangles to CCW
    890     if (det > 0.0)
    891     {
                // negate both the float and the integer coefficients so they stay in sync
    892         vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
    893         vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
    894         vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
    895         vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
    896         det = -det;
    897     }
    899     __m128 vC;
    900     // Finish triangle setup - C edge coef
    901     triangleSetupC(vX, vY, vA, vB, vC);
    903     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    904     {
    905         // If we have degenerate edge(s) to rasterize, set I and J coefs
    906         // to 0 for constant interpolation of attributes
    907         triDesc.I[0] = 0.0f;
    908         triDesc.I[1] = 0.0f;
    909         triDesc.I[2] = 0.0f;
    910         triDesc.J[0] = 0.0f;
    911         triDesc.J[1] = 0.0f;
    912         triDesc.J[2] = 0.0f;
    914         // Degenerate triangles have no area
    915         triDesc.recipDet = 0.0f;
    916     }
    917     else
    918     {
    919         // only extract coefs for 2 of the barycentrics; the 3rd can be
    920         // determined from the barycentric equation:
    921         // i + j + k = 1 <=> k = 1 - j - i
                // I takes lane 1 and J takes lane 2 of each of vA, vB and vC
    922         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
    923         _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
    924         _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
    925         _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
    926         _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
    927         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
    929         // compute recipDet, used to calculate barycentric i and j in the backend
    930         triDesc.recipDet = 1.0f/det;
    931     }
            // 1/w is stored as deltas relative to vertex 2: [w0 - w2, w1 - w2, w2]
    933     OSALIGNSIMD(float) oneOverW[4];
    934     _mm_store_ps(oneOverW, vRecipW);
    935     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    936     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    937     triDesc.OneOverW[2] = oneOverW[2];
    939     // calculate perspective correct coefs per vertex attrib
    940     float* pPerspAttribs = perspAttribsTLS;
    941     float* pAttribs = workDesc.pAttribs;
    942     triDesc.pPerspAttribs = pPerspAttribs;
    943     triDesc.pAttribs = pAttribs;
    944     float *pRecipW = workDesc.pTriBuffer + 12;
    945     triDesc.pRecipW = pRecipW;
            // pRecipW is advanced in place below; triDesc.pRecipW (captured above)
            // keeps pointing at the first 1/w value
    946     __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    947     __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
    948     __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
    949     for(uint32_t i = 0; i < workDesc.numAttribs; i++)
    950     {
                // each attribute occupies 3 consecutive 4-float vectors (one per
                // vertex); each is scaled by that vertex's 1/w
    951         __m128 attribA = _mm_load_ps(pAttribs);
    952         __m128 attribB = _mm_load_ps(pAttribs+=4);
    953         __m128 attribC = _mm_load_ps(pAttribs+=4);
    954         pAttribs+=4;
    956         attribA = _mm_mul_ps(attribA, vOneOverWV0);
    957         attribB = _mm_mul_ps(attribB, vOneOverWV1);
    958         attribC = _mm_mul_ps(attribC, vOneOverWV2);
    960         _mm_store_ps(pPerspAttribs, attribA);
    961         _mm_store_ps(pPerspAttribs+=4, attribB);
    962         _mm_store_ps(pPerspAttribs+=4, attribC);
    963         pPerspAttribs+=4;
    964     }
    966     // compute bary Z
    967     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
            // NOTE(review): the values stored below are deltas relative to vertex 2
            // ([z0 - z2, z1 - z2, z2]), which does not match the formula as written
            // above - confirm against the backend's interpolation.
    968     OSALIGNSIMD(float) a[4];
    969     _mm_store_ps(a, vZ);
    970     triDesc.Z[0] = a[0] - a[2];
    971     triDesc.Z[1] = a[1] - a[2];
    972     triDesc.Z[2] = a[2];
    974     // add depth bias
    975     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
    977     // Calc bounding box of triangle
    978     OSALIGNSIMD(SWR_RECT) bbox;
    979     calcBoundingBoxInt(vXi, vYi, bbox);
    981     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
    983     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    984     {
    985         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
    986         bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
    987         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
    988                    "Conservative rast degenerate handling requires a valid scissor rect");
    989     }
    991     // Intersect with scissor/viewport
            // (the bbox max is reduced by one before clamping; presumably it is an
            // exclusive bound - confirm against calcBoundingBoxInt)
    992     OSALIGNSIMD(SWR_RECT) intersect;
    993     intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    994     intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    995     intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    996     intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
    998     triDesc.triFlags = workDesc.triFlags;
   1000     // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
   1001     uint32_t macroX, macroY;
   1002     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
   1003     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
   1004     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
   1005     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
   1006     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
   1008     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
   1009     intersect.ymin = std::max(intersect.ymin, macroBoxTop);
   1010     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
   1011     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
   1013     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
   1015     AR_END(BETriangleSetup, 0);
   1017     // update triangle desc
            // convert the fixed point intersection rect to raster tile indices
   1018     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
   1019     uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
   1020     uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
   1021     uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
   1022     uint32_t numTilesX = maxTileX - minTileX + 1;
   1023     uint32_t numTilesY = maxTileY - minTileY + 1;
            // NOTE(review): these counts are unsigned, so they are only zero when
            // max == min - 1 (wrap-around); with the assert above holding that
            // cannot happen, so this is purely a guard for an empty intersection.
   1025     if (numTilesX == 0 || numTilesY == 0)
   1026     {
   1027         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
   1028         AR_END(BERasterizeTriangle, 1);
   1029         return;
   1030     }
   1032     AR_BEGIN(BEStepSetup, pDC->drawId);
   1034     // Step to pixel center of top-left pixel of the triangle bbox
   1035     // Align intersect bbox (top/left) to raster tile's (top/left).
   1036     int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
   1037     int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
   1039     // convenience typedef
   1040     typedef typename RT::NumRasterSamplesT NumRasterSamplesT;
   1042     // single sample rasterization evaluates edges at pixel center,
   1043     // multisample evaluates edges UL pixel corner and steps to each sample position
   1044     if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
   1045     {
   1046         // Add 0.5, in fixed point, to offset to pixel center
   1047         x += (FIXED_POINT_SCALE / 2);
   1048         y += (FIXED_POINT_SCALE / 2);
   1049     }
   1051     __m128i vTopLeftX = _mm_set1_epi32(x);
   1052     __m128i vTopLeftY = _mm_set1_epi32(y);
   1054     // evaluate edge equations at top-left pixel using 64bit math
   1055     //
   1056     // line = Ax + By + C
   1057     // solving for C:
   1058     // C = -Ax - By
   1059     // we know x0 and y0 are on the line; plug them in:
   1060     // C = -Ax0 - By0
   1061     // plug C back into line equation:
   1062     // line = Ax + By - Ax0 - By0
   1063     // line = A(x - x0) + B(y - y0)
   1064     // dX = (x-x0), dY = (y-y0)
   1065     // so all this simplifies to
   1066     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
   1068     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
   1069     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
   1071     // evaluate A(dx) and B(dY) for all points
            // (operands are widened to double before multiplying)
   1072     __m256d vAipd = _mm256_cvtepi32_pd(vAi);
   1073     __m256d vBipd = _mm256_cvtepi32_pd(vBi);
   1074     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
   1075     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
   1077     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
   1078     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
   1079     __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
   1081     // apply any edge adjustments(top-left, crast, etc)
   1082     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
   1084     // broadcast respective edge results to all lanes
            // vEdgeFix16 layout: [0..2] triangle edges, [3..6] scissor edges
            // (the latter are only written by ComputeScissorEdges, when enabled)
   1085     double* pEdge = (double*)&vEdge;
   1086     __m256d vEdgeFix16[7];
   1087     vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
   1088     vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
   1089     vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
   1091     OSALIGNSIMD(int32_t) aAi[4], aBi[4];
   1092     _mm_store_si128((__m128i*)aAi, vAi);
   1093     _mm_store_si128((__m128i*)aBi, vBi);
   1094     EDGE rastEdges[RT::NumEdgesT::value];
   1096     // Compute and store triangle edge data
   1097     ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
   1098     ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
   1099     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
   1101     // Compute and store triangle edge data if scissor needs to be rasterized
   1102     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
   1103                        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
   1105     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
   1106     // used for testing if entire raster tile is inside a triangle
   1107     for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1108     {
   1109         vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
   1110     }
   1112     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
   1113     // step sample positions to the raster tile bbox of multisample points
   1114     // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
   1115     //                             |      |
   1116     //                             |      |
   1117     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
            // NOTE(review): vEdgeTileBbox is only written when more than one raster
            // sample is used; presumably the single-sample UpdateEdgeMasks ignores
            // it - confirm.
   1118     __m256d vEdgeTileBbox[3];
   1119     if (NumRasterSamplesT::value > 1)
   1120     {
   1121         __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX();
   1122         __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY();
   1124         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
   1125         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
   1127         // step edge equation tests from Tile
   1128         // used for testing if entire raster tile is inside a triangle
   1129         for (uint32_t e = 0; e < 3; ++e)
   1130         {
   1131             __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
   1132             __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
   1133             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
   1135             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
   1136             adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
   1137         }
   1138     }
   1140     AR_END(BEStepSetup, 0);
   1142     uint32_t tY = minTileY;
   1143     uint32_t tX = minTileX;
   1144     uint32_t maxY = maxTileY;
   1145     uint32_t maxX = maxTileX;
            // renderBuffers follows the current tile; currentRenderBufferRow is
            // handed to StepRasterTileY together with it at the end of each row
   1147     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
   1148     GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
   1149     currentRenderBufferRow = renderBuffers;
   1151     // rasterize and generate coverage masks per sample
   1152     for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
   1153     {
                // remember the edge values at the first tile of this row; the step
                // to the next row starts from them again
   1154         __m256d vStartOfRowEdge[RT::NumEdgesT::value];
   1155         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1156         {
   1157             vStartOfRowEdge[e] = vEdgeFix16[e];
   1158         }
   1160         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
   1161         {
   1162             triDesc.anyCoveredSamples = 0;
   1164             // is the corner of the edge outside of the raster tile? (vEdge < 0)
   1165             int mask0, mask1, mask2;
   1166             UpdateEdgeMasks<NumRasterSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
   1168             for (uint32_t sampleNum = 0; sampleNum < NumRasterSamplesT::value; sampleNum++)
   1169             {
   1170                 // trivial reject, at least one edge has all 4 corners of raster tile outside
   1171                 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
   1173                 if (!trivialReject)
   1174                 {
   1175                     // trivial accept mask
   1176                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
   1178                     // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
                            // NOTE(review): the masks are passed by reference and are
                            // declared outside the sample loop, so any change made here
                            // is also seen by the reject test of the following samples.
   1179                     UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
   1180                         (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
   1182                     // @todo Make this a bit smarter to allow use of trivial accept when:
   1183                     //   1) scissor/vp intersection rect is raster tile aligned
   1184                     //   2) raster tile is entirely within scissor/vp intersection rect
   1185                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
   1186                     {
   1187                         // trivial accept, all 4 corners of all 3 edges are negative
   1188                         // i.e. raster tile completely inside triangle
   1189                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
   1190                         if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
   1191                         {
   1192                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
   1193                         }
   1194                         RDTSC_EVENT(BETrivialAccept, 1, 0);
   1195                     }
   1196                     else
   1197                     {
   1198                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
   1199                         if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
   1200                         {
   1201                             // should get optimized out for single sample case (global value numbering or copy propagation)
   1202                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1203                             {
   1204                                 vEdgeAtSample[e] = vEdgeFix16[e];
   1205                             }
   1206                         }
   1207                         else
   1208                         {
   1209                             __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum);
   1210                             __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum);
   1211                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
   1212                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
   1214                             // step edge equation tests from UL tile corner to pixel sample position
   1215                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1216                             {
   1217                                 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
   1218                                 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
   1219                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
   1220                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
   1221                             }
   1222                         }
                                // the mask enables only the lowest 64-bit lane, so exactly one
                                // double (lane 0) per edge is written to startQuadEdges
   1224                         double startQuadEdges[RT::NumEdgesT::value];
   1225                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
   1226                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1227                         {
   1228                             _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
   1229                         }
   1231                         // not trivial accept or reject, must rasterize full tile
   1232                         AR_BEGIN(BERasterizePartial, pDC->drawId);
   1233                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
   1234                         AR_END(BERasterizePartial, 0);
   1236                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
   1238                         // Output SV InnerCoverage, if needed
   1239                         GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
   1240                     }
   1241                 }
   1242                 else
   1243                 {
   1244                     // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
   1245                     if(NumRasterSamplesT::value > 1)
   1246                     {
   1247                         triDesc.coverageMask[sampleNum] = 0;
   1248                     }
   1249                     RDTSC_EVENT(BETrivialReject, 1, 0);
   1250                 }
   1251             }
                    // NOTE(review): the #endif below has no visible matching #if in this
                    // listing (a line is missing right here); as shown, the dangling
                    // "else" binds to the if(KNOB_TOSS_RS) above it.
   1254             if(KNOB_TOSS_RS)
   1255             {
   1256                 gToss = triDesc.coverageMask[0];
   1257             }
   1258             else
   1259 #endif
   1260             if(triDesc.anyCoveredSamples)
   1261             {
   1262                 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
   1263                 // copy conservative coverage result to all samples
   1264                 if(RT::IsConservativeT::value)
   1265                 {
   1266                     auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
   1267                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
   1268                 }
   1270                 AR_BEGIN(BEPixelBackend, pDC->drawId);
                        // tile indices are shifted up by the tile dimension shift before
                        // being handed to the backend
   1271                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
   1272                 AR_END(BEPixelBackend, 0);
   1273             }
   1275             // step to the next tile in X
   1276             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1277             {
   1278                 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
   1279             }
   1280             StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
   1281         }
   1283         // step to the next tile in Y
                // (restart from the values saved at the beginning of this row)
   1284         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
   1285         {
   1286             vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
   1287         }
   1288         StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
   1289     }
   1291     AR_END(BERasterizeTriangle, 1);
   1292 }
   1294 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
   1295 {
   1296     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
   1297     const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
   1298     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
   1300     bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
   1302     // load point vertex
   1303     float x = *workDesc.pTriBuffer;
   1304     float y = *(workDesc.pTriBuffer + 1);
   1305     float z = *(workDesc.pTriBuffer + 2);
   1307     // create a copy of the triangle buffer to write our adjusted vertices to
   1308     OSALIGNSIMD(float) newTriBuffer[4 * 4];
   1309     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
   1310     newWorkDesc.pTriBuffer = &newTriBuffer[0];
   1312     // create a copy of the attrib buffer to write our adjusted attribs to
   1313     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
   1314     newWorkDesc.pAttribs = &newAttribBuffer[0];
   1316     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
   1317     newWorkDesc.numAttribs = workDesc.numAttribs;
   1318     newWorkDesc.triFlags = workDesc.triFlags;
   1320     // construct two tris by bloating point by point size
   1321     float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
   1322     float lowerX = x - halfPointSize;
   1323     float upperX = x + halfPointSize;
   1324     float lowerY = y - halfPointSize;
   1325     float upperY = y + halfPointSize;
   1327     // tri 0
   1328     float *pBuf = &newTriBuffer[0];
   1329     *pBuf++ = lowerX;
   1330     *pBuf++ = lowerX;
   1331     *pBuf++ = upperX;
   1332     pBuf++;
   1333     *pBuf++ = lowerY;
   1334     *pBuf++ = upperY;
   1335     *pBuf++ = upperY;
   1336     pBuf++;
   1337     _mm_store_ps(pBuf, _mm_set1_ps(z));
   1338     _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
   1340     // setup triangle rasterizer function
   1341     PFN_WORK_FUNC pfnTriRast;
   1342     // for center sample pattern, all samples are at pixel center; calculate coverage
   1343     // once at center and broadcast the results in the backend
   1344     uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
   1345     // conservative rast not supported for points/lines
   1346     pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false));
   1348     // overwrite texcoords for point sprites
   1349     if (isPointSpriteTexCoordEnabled)
   1350     {
   1351         // copy original attribs
   1352         memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
   1353         newWorkDesc.pAttribs = &newAttribBuffer[0];
   1355         // overwrite texcoord for point sprites
   1356         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
   1357         DWORD texCoordAttrib = 0;
   1359         while (_BitScanForward(&texCoordAttrib, texCoordMask))
   1360         {
   1361             texCoordMask &= ~(1 << texCoordAttrib);
   1362             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
   1363             if (rastState.pointSpriteTopOrigin)
   1364             {
   1365                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
   1366                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
   1367                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
   1368             }
   1369             else
   1370             {
   1371                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
   1372                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
   1373                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
   1374             }
   1375         }
   1376     }
   1377     else
   1378     {
   1379         // no texcoord overwrite, can reuse the attrib buffer from frontend
   1380         newWorkDesc.pAttribs = workDesc.pAttribs;
   1381     }
   1383     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
   1385     // tri 1
   1386     pBuf = &newTriBuffer[0];
   1387     *pBuf++ = lowerX;
   1388     *pBuf++ = upperX;
   1389     *pBuf++ = upperX;
   1390     pBuf++;
   1391     *pBuf++ = lowerY;
   1392     *pBuf++ = upperY;
   1393     *pBuf++ = lowerY;
   1394     // z, w unchanged
   1396     if (isPointSpriteTexCoordEnabled)
   1397     {
   1398         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
   1399         DWORD texCoordAttrib = 0;
   1401         while (_BitScanForward(&texCoordAttrib, texCoordMask))
   1402         {
   1403             texCoordMask &= ~(1 << texCoordAttrib);
   1404             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
   1405             if (rastState.pointSpriteTopOrigin)
   1406             {
   1407                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
   1408                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
   1409                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
   1411             }
   1412             else
   1413             {
   1414                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
   1415                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
   1416                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
   1417             }
   1418         }
   1419     }
   1421     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
   1422 }
   1424 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
   1425 {
   1426     SWR_CONTEXT *pContext = pDC->pContext;
   1429     if (KNOB_TOSS_BIN_TRIS)
   1430     {
   1431         return;
   1432     }
   1433 #endif
   1435     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
   1436     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
   1438     // map x,y relative offsets from start of raster tile to bit position in
   1439     // coverage mask for the point
   1440     static const uint32_t coverageMap[8][8] = {
   1441         { 0, 1, 4, 5, 8, 9, 12, 13 },
   1442         { 2, 3, 6, 7, 10, 11, 14, 15 },
   1443         { 16, 17, 20, 21, 24, 25, 28, 29 },
   1444         { 18, 19, 22, 23, 26, 27, 30, 31 },
   1445         { 32, 33, 36, 37, 40, 41, 44, 45 },
   1446         { 34, 35, 38, 39, 42, 43, 46, 47 },
   1447         { 48, 49, 52, 53, 56, 57, 60, 61 },
   1448         { 50, 51, 54, 55, 58, 59, 62, 63 }
   1449     };
   1453     // pull point information from triangle buffer
   1454     // @todo use structs for readability
   1455     uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
   1456     uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
   1457     float z = *(workDesc.pTriBuffer + 2);
   1459     // construct triangle descriptor for point
   1460     // no interpolation, set up i,j for constant interpolation of z and attribs
   1461     // @todo implement an optimized backend that doesn't require triangle information
   1463     // compute coverage mask from x,y packed into the coverageMask flag
   1464     // mask indices by the maximum valid index for x/y of coveragemap.
   1465     uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
   1466     uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
   1467     // todo: multisample points?
   1468     triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
   1470     // no persp divide needed for points
   1471     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
   1472     triDesc.triFlags = workDesc.triFlags;
   1473     triDesc.recipDet = 1.0f;
   1474     triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
   1475     triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
   1476     triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
   1477     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
   1479     RenderOutputBuffers renderBuffers;
   1480     GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
   1481         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
   1483     AR_BEGIN(BEPixelBackend, pDC->drawId);
   1484     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
   1485     AR_END(BEPixelBackend, 0);
   1486 }
   1488 // Get pointers to hot tile memory for color RT, depth, stencil
   1489 template <uint32_t numSamples>
   1490 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
   1491 {
   1492     const API_STATE& state = GetApiState(pDC);
   1493     SWR_CONTEXT *pContext = pDC->pContext;
   1495     uint32_t mx, my;
   1496     MacroTileMgr::getTileIndices(macroID, mx, my);
   1497     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
   1498     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
   1500     // compute tile offset for active hottile buffers
   1501     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
   1502     uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1503     offset*=numSamples;
   1505     unsigned long rtSlot = 0;
   1506     uint32_t colorHottileEnableMask = state.colorHottileEnable;
   1507     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
   1508     {
   1509         HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
   1510             numSamples, renderTargetArrayIndex);
   1511         pColor->state = HOTTILE_DIRTY;
   1512         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
   1514         colorHottileEnableMask &= ~(1 << rtSlot);
   1515     }
   1516     if(state.depthHottileEnable)
   1517     {
   1518         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
   1519         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1520         offset*=numSamples;
   1521         HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
   1522             numSamples, renderTargetArrayIndex);
   1523         pDepth->state = HOTTILE_DIRTY;
   1524         SWR_ASSERT(pDepth->pBuffer != nullptr);
   1525         renderBuffers.pDepth = pDepth->pBuffer + offset;
   1526     }
   1527     if(state.stencilHottileEnable)
   1528     {
   1529         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
   1530         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
   1531         offset*=numSamples;
   1532         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
   1533             numSamples, renderTargetArrayIndex);
   1534         pStencil->state = HOTTILE_DIRTY;
   1535         SWR_ASSERT(pStencil->pBuffer != nullptr);
   1536         renderBuffers.pStencil = pStencil->pBuffer + offset;
   1537     }
   1538 }
   1540 template <typename RT>
   1541 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
   1542 {
   1543     for(uint32_t rt = 0; rt < NumRT; ++rt)
   1544     {
   1545         buffers.pColor[rt] += RT::colorRasterTileStep;
   1546     }
   1548     buffers.pDepth += RT::depthRasterTileStep;
   1549     buffers.pStencil += RT::stencilRasterTileStep;
   1550 }
   1552 template <typename RT>
   1553 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
   1554 {
   1555     for(uint32_t rt = 0; rt < NumRT; ++rt)
   1556     {
   1557         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
   1558         buffers.pColor[rt] = startBufferRow.pColor[rt];
   1559     }
   1560     startBufferRow.pDepth += RT::depthRasterTileRowStep;
   1561     buffers.pDepth = startBufferRow.pDepth;
   1563     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
   1564     buffers.pStencil = startBufferRow.pStencil;
   1565 }
   1567 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
   1568 {
   1569     SWR_CONTEXT *pContext = pDC->pContext;
   1570     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
   1572     if (KNOB_TOSS_BIN_TRIS)
   1573     {
   1574         return;
   1575     }
   1576 #endif
   1578     // bloat line to two tris and call the triangle rasterizer twice
   1579     AR_BEGIN(BERasterizeLine, pDC->drawId);
   1581     const API_STATE &state = GetApiState(pDC);
   1582     const SWR_RASTSTATE &rastState = state.rastState;
   1584     // macrotile dimensioning
   1585     uint32_t macroX, macroY;
   1586     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
   1587     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
   1588     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
   1589     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
   1590     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
   1592     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
   1594     // create a copy of the triangle buffer to write our adjusted vertices to
   1595     OSALIGNSIMD(float) newTriBuffer[4 * 4];
   1596     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
   1597     newWorkDesc.pTriBuffer = &newTriBuffer[0];
   1599     // create a copy of the attrib buffer to write our adjusted attribs to
   1600     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
   1601     newWorkDesc.pAttribs = &newAttribBuffer[0];
   1603     const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
   1604     const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
   1606     __m128 vX, vY, vZ, vRecipW;
   1608     vX = _mm_load_ps(workDesc.pTriBuffer);
   1609     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
   1610     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
   1611     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
   1613     // triangle 0
   1614     // v0,v1 -> v0,v0,v1
   1615     __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
   1616     __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
   1617     __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
   1618     __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
   1620     __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
   1621     __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
   1622     if (workDesc.triFlags.yMajor)
   1623     {
   1624         vXa = _mm_add_ps(vAdjust, vXa);
   1625     }
   1626     else
   1627     {
   1628         vYa = _mm_add_ps(vAdjust, vYa);
   1629     }
   1631     // Store triangle description for rasterizer
   1632     _mm_store_ps((float*)&newTriBuffer[0], vXa);
   1633     _mm_store_ps((float*)&newTriBuffer[4], vYa);
   1634     _mm_store_ps((float*)&newTriBuffer[8], vZa);
   1635     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
   1637     // binner bins 3 edges for lines as v0, v1, v1
   1638     // tri0 needs v0, v0, v1
   1639     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
   1640     {
   1641         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
   1642         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
   1644         _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
   1645         _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
   1646         _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
   1647     }
   1649     // Store user clip distances for triangle 0
   1650     float newClipBuffer[3 * 8];
   1651     uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
   1652     if (numClipDist)
   1653     {
   1654         newWorkDesc.pUserClipBuffer = newClipBuffer;
   1656         float* pOldBuffer = workDesc.pUserClipBuffer;
   1657         float* pNewBuffer = newClipBuffer;
   1658         for (uint32_t i = 0; i < numClipDist; ++i)
   1659         {
   1660             // read barycentric coeffs from binner
   1661             float a = *(pOldBuffer++);
   1662             float b = *(pOldBuffer++);
   1664             // reconstruct original clip distance at vertices
   1665             float c0 = a + b;
   1666             float c1 = b;
   1668             // construct triangle barycentrics
   1669             *(pNewBuffer++) = c0 - c1;
   1670             *(pNewBuffer++) = c0 - c1;
   1671             *(pNewBuffer++) = c1;
   1672         }
   1673     }
   1675     // setup triangle rasterizer function
   1676     PFN_WORK_FUNC pfnTriRast;
   1677     uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
   1678     // conservative rast not supported for points/lines
   1679     pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false));
   1681     // make sure this macrotile intersects the triangle
   1682     __m128i vXai = fpToFixedPoint(vXa);
   1683     __m128i vYai = fpToFixedPoint(vYa);
   1684     OSALIGNSIMD(SWR_RECT) bboxA;
   1685     calcBoundingBoxInt(vXai, vYai, bboxA);
   1687     if (!(bboxA.xmin > macroBoxRight ||
   1688           bboxA.xmin > scissorInFixedPoint.xmax ||
   1689           bboxA.xmax - 1 < macroBoxLeft ||
   1690           bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
   1691           bboxA.ymin > macroBoxBottom ||
   1692           bboxA.ymin > scissorInFixedPoint.ymax ||
   1693           bboxA.ymax - 1 < macroBoxTop ||
   1694           bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
   1695         // rasterize triangle
   1696         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
   1697     }
   1699     // triangle 1
   1700     // v0,v1 -> v1,v1,v0
   1701     vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
   1702     vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
   1703     vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
   1704     vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
   1706     vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
   1707     if (workDesc.triFlags.yMajor)
   1708     {
   1709         vXa = _mm_add_ps(vAdjust, vXa);
   1710     }
   1711     else
   1712     {
   1713         vYa = _mm_add_ps(vAdjust, vYa);
   1714     }
   1716     // Store triangle description for rasterizer
   1717     _mm_store_ps((float*)&newTriBuffer[0], vXa);
   1718     _mm_store_ps((float*)&newTriBuffer[4], vYa);
   1719     _mm_store_ps((float*)&newTriBuffer[8], vZa);
   1720     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
   1722     // binner bins 3 edges for lines as v0, v1, v1
   1723     // tri1 needs v1, v1, v0
   1724     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
   1725     {
   1726         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
   1727         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
   1729         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
   1730         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
   1731         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
   1732     }
   1734     // store user clip distance for triangle 1
   1735     if (numClipDist)
   1736     {
   1737         float* pOldBuffer = workDesc.pUserClipBuffer;
   1738         float* pNewBuffer = newClipBuffer;
   1739         for (uint32_t i = 0; i < numClipDist; ++i)
   1740         {
   1741             // read barycentric coeffs from binner
   1742             float a = *(pOldBuffer++);
   1743             float b = *(pOldBuffer++);
   1745             // reconstruct original clip distance at vertices
   1746             float c0 = a + b;
   1747             float c1 = b;
   1749             // construct triangle barycentrics
   1750             *(pNewBuffer++) = c1 - c0;
   1751             *(pNewBuffer++) = c1 - c0;
   1752             *(pNewBuffer++) = c0;
   1753         }
   1754     }
   1756     vXai = fpToFixedPoint(vXa);
   1757     vYai = fpToFixedPoint(vYa);
   1758     calcBoundingBoxInt(vXai, vYai, bboxA);
   1760     if (!(bboxA.xmin > macroBoxRight ||
   1761           bboxA.xmin > scissorInFixedPoint.xmax ||
   1762           bboxA.xmax - 1 < macroBoxLeft ||
   1763           bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
   1764           bboxA.ymin > macroBoxBottom ||
   1765           bboxA.ymin > scissorInFixedPoint.ymax ||
   1766           bboxA.ymax - 1 < macroBoxTop ||
   1767           bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
   1768         // rasterize triangle
   1769         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
   1770     }
   1772     AR_END(BERasterizeLine, 1);
   1773 }
   1775 struct RasterizerChooser
   1776 {
   1777     typedef PFN_WORK_FUNC FuncType;
   1779     template <typename... ArgsB>
   1780     static FuncType GetFunc()
   1781     {
   1782         return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
   1783     }
   1784 };
   1786 // Selector for correct templated RasterizeTriangle function
   1787 PFN_WORK_FUNC GetRasterizerFunc(
   1788     uint32_t numSamples,
   1789     bool IsConservative,
   1790     uint32_t InputCoverage,
   1791     uint32_t EdgeEnable,
   1792     bool RasterizeScissorEdges
   1793 )
   1794 {
   1795     return TemplateArgUnroller<RasterizerChooser>::GetFunc(
   1796         IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
   1797         IsConservative,
   1798         IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
   1799         IntArg<0, VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
   1800         RasterizeScissorEdges);
   1801 }