Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file binner.cpp
     24 *
     25 * @brief Implementation for the macrotile binner
     26 *
     27 ******************************************************************************/
     28 
     29 #include "context.h"
     30 #include "frontend.h"
     31 #include "conservativeRast.h"
     32 #include "pa.h"
     33 #include "rasterizer.h"
     34 #include "rdtsc_core.h"
     35 #include "tilemgr.h"
     36 
// Forward declaration of the post-setup line binner. BinTriangles calls it
// once per triangle edge when the fill mode is SWR_FILLMODE_WIREFRAME; the
// definition is not part of this section of the file.
void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
     39 
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
/// raster state. Indexed by the raster state's pixelLocation value; the
/// selected offset is added to both X and Y of every vertex.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
     48 
     49 //////////////////////////////////////////////////////////////////////////
     50 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
     51 /// Point precision from FP32.
     52 template <typename PT = FixedPointTraits<Fixed_16_8>>
     53 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
     54 {
     55     simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
     56     return _simd_cvtps_epi32(vFixed);
     57 }
     58 
     59 //////////////////////////////////////////////////////////////////////////
     60 /// @brief Helper function to set the X,Y coords of a triangle to the
     61 /// requested Fixed Point precision from FP32.
     62 /// @param tri: simdvector[3] of FP triangle verts
     63 /// @param vXi: fixed point X coords of tri verts
     64 /// @param vYi: fixed point Y coords of tri verts
     65 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
     66 {
     67     vXi[0] = fpToFixedPointVertical(tri[0].x);
     68     vYi[0] = fpToFixedPointVertical(tri[0].y);
     69     vXi[1] = fpToFixedPointVertical(tri[1].x);
     70     vYi[1] = fpToFixedPointVertical(tri[1].y);
     71     vXi[2] = fpToFixedPointVertical(tri[2].x);
     72     vYi[2] = fpToFixedPointVertical(tri[2].y);
     73 }
     74 
     75 //////////////////////////////////////////////////////////////////////////
     76 /// @brief Calculate bounding box for current triangle
     77 /// @tparam CT: ConservativeRastFETraits type
     78 /// @param vX: fixed point X position for triangle verts
     79 /// @param vY: fixed point Y position for triangle verts
     80 /// @param bbox: fixed point bbox
     81 /// *Note*: expects vX, vY to be in the correct precision for the type
     82 /// of rasterization. This avoids unnecessary FP->fixed conversions.
     83 template <typename CT>
     84 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
     85 {
     86     simdscalari vMinX = vX[0];
     87     vMinX = _simd_min_epi32(vMinX, vX[1]);
     88     vMinX = _simd_min_epi32(vMinX, vX[2]);
     89 
     90     simdscalari vMaxX = vX[0];
     91     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
     92     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
     93 
     94     simdscalari vMinY = vY[0];
     95     vMinY = _simd_min_epi32(vMinY, vY[1]);
     96     vMinY = _simd_min_epi32(vMinY, vY[2]);
     97 
     98     simdscalari vMaxY = vY[0];
     99     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
    100     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
    101 
    102     bbox.xmin = vMinX;
    103     bbox.xmax = vMaxX;
    104     bbox.ymin = vMinY;
    105     bbox.ymax = vMaxY;
    106 }
    107 
    108 //////////////////////////////////////////////////////////////////////////
    109 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
    110 /// Offsets BBox for conservative rast
    111 template <>
    112 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
    113 {
    114     // FE conservative rast traits
    115     typedef FEConservativeRastT CT;
    116 
    117     simdscalari vMinX = vX[0];
    118     vMinX = _simd_min_epi32(vMinX, vX[1]);
    119     vMinX = _simd_min_epi32(vMinX, vX[2]);
    120 
    121     simdscalari vMaxX = vX[0];
    122     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
    123     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
    124 
    125     simdscalari vMinY = vY[0];
    126     vMinY = _simd_min_epi32(vMinY, vY[1]);
    127     vMinY = _simd_min_epi32(vMinY, vY[2]);
    128 
    129     simdscalari vMaxY = vY[0];
    130     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
    131     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
    132 
    133     /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
    134     /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
    135     bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
    136     bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
    137     bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
    138     bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
    139 }
    140 
    141 //////////////////////////////////////////////////////////////////////////
    142 /// @brief Processes attributes for the backend based on linkage mask and
    143 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
    144 /// @param pDC - Draw context
    145 /// @param pa - Primitive Assembly state
    146 /// @param linkageMask - Specifies which VS outputs are routed to PS.
    147 /// @param pLinkageMap - maps VS attribute slot to PS slot
    148 /// @param triIndex - Triangle to process attributes for
    149 /// @param pBuffer - Output result
    150 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
    151 INLINE void ProcessAttributes(
    152     DRAW_CONTEXT *pDC,
    153     PA_STATE&pa,
    154     uint32_t triIndex,
    155     uint32_t primId,
    156     float *pBuffer)
    157 {
    158     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    159     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    160     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    161     LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    162     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    163     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
    164 
    165     static const float constTable[3][4] = {
    166         { 0.0f, 0.0f, 0.0f, 0.0f },
    167         { 0.0f, 0.0f, 0.0f, 1.0f },
    168         { 1.0f, 1.0f, 1.0f, 1.0f }
    169     };
    170 
    171     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
    172     {
    173         uint32_t inputSlot;
    174         if (IsSwizzledT::value)
    175         {
    176             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
    177             inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
    178 
    179         }
    180         else
    181         {
    182             inputSlot = VERTEX_ATTRIB_START_SLOT + i;
    183         }
    184 
    185         __m128 attrib[3];    // triangle attribs (always 4 wide)
    186         float* pAttribStart = pBuffer;
    187 
    188         if (HasConstantInterpT::value || IsDegenerate::value)
    189         {
    190             if (_bittest(&constantInterpMask, i))
    191             {
    192                 uint32_t vid;
    193                 uint32_t adjustedTriIndex;
    194                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
    195                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
    196                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
    197                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
    198                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
    199 
    200                 switch (topo) {
    201                 case TOP_QUAD_LIST:
    202                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
    203                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
    204                     break;
    205                 case TOP_QUAD_STRIP:
    206                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
    207                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
    208                     break;
    209                 case TOP_TRIANGLE_STRIP:
    210                     adjustedTriIndex = triIndex;
    211                     vid = (triIndex & 1)
    212                         ? tristripProvokingVertex[provokingVertex]
    213                         : provokingVertex;
    214                     break;
    215                 default:
    216                     adjustedTriIndex = triIndex;
    217                     vid = provokingVertex;
    218                     break;
    219                 }
    220 
    221                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
    222 
    223                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
    224                 {
    225                     _mm_store_ps(pBuffer, attrib[vid]);
    226                     pBuffer += 4;
    227                 }
    228             }
    229             else
    230             {
    231                 pa.AssembleSingle(inputSlot, triIndex, attrib);
    232 
    233                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
    234                 {
    235                     _mm_store_ps(pBuffer, attrib[i]);
    236                     pBuffer += 4;
    237                 }
    238             }
    239         }
    240         else
    241         {
    242             pa.AssembleSingle(inputSlot, triIndex, attrib);
    243 
    244             for (uint32_t i = 0; i < NumVertsT::value; ++i)
    245             {
    246                 _mm_store_ps(pBuffer, attrib[i]);
    247                 pBuffer += 4;
    248             }
    249         }
    250 
    251         // pad out the attrib buffer to 3 verts to ensure the triangle
    252         // interpolation code in the pixel shader works correctly for the
    253         // 3 topologies - point, line, tri.  This effectively zeros out the
    254         // effect of the missing vertices in the triangle interpolation.
    255         for (uint32_t v = NumVertsT::value; v < 3; ++v)
    256         {
    257             _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
    258             pBuffer += 4;
    259         }
    260 
    261         // check for constant source overrides
    262         if (IsSwizzledT::value)
    263         {
    264             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
    265             if (mask)
    266             {
    267                 DWORD comp;
    268                 while (_BitScanForward(&comp, mask))
    269                 {
    270                     mask &= ~(1 << comp);
    271 
    272                     float constantValue = 0.0f;
    273                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
    274                     {
    275                     case SWR_CONSTANT_SOURCE_CONST_0000:
    276                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
    277                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
    278                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
    279                         break;
    280                     case SWR_CONSTANT_SOURCE_PRIM_ID:
    281                         constantValue = *(float*)&primId;
    282                         break;
    283                     }
    284 
    285                     // apply constant value to all 3 vertices
    286                     for (uint32_t v = 0; v < 3; ++v)
    287                     {
    288                         pAttribStart[comp + v * 4] = constantValue;
    289                     }
    290                 }
    291             }
    292         }
    293     }
    294 }
    295 
    296 //////////////////////////////////////////////////////////////////////////
    297 /// @brief  Gather scissor rect data based on per-prim viewport indices.
    298 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
    299 /// @param pViewportIndex - array of per-primitive vewport indexes.
    300 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
    301 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
    302 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
    303 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
    304 //
    305 /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
    306 template<size_t SimdWidth>
    307 struct GatherScissors
    308 {
    309     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
    310         simdscalari &scisXmin, simdscalari &scisYmin,
    311         simdscalari &scisXmax, simdscalari &scisYmax)
    312     {
    313         SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    314     }
    315 };
    316 
    317 template<>
    318 struct GatherScissors<8>
    319 {
    320     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
    321         simdscalari &scisXmin, simdscalari &scisYmin,
    322         simdscalari &scisXmax, simdscalari &scisYmax)
    323     {
    324         scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
    325             pScissorsInFixedPoint[pViewportIndex[1]].xmin,
    326             pScissorsInFixedPoint[pViewportIndex[2]].xmin,
    327             pScissorsInFixedPoint[pViewportIndex[3]].xmin,
    328             pScissorsInFixedPoint[pViewportIndex[4]].xmin,
    329             pScissorsInFixedPoint[pViewportIndex[5]].xmin,
    330             pScissorsInFixedPoint[pViewportIndex[6]].xmin,
    331             pScissorsInFixedPoint[pViewportIndex[7]].xmin);
    332         scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
    333             pScissorsInFixedPoint[pViewportIndex[1]].ymin,
    334             pScissorsInFixedPoint[pViewportIndex[2]].ymin,
    335             pScissorsInFixedPoint[pViewportIndex[3]].ymin,
    336             pScissorsInFixedPoint[pViewportIndex[4]].ymin,
    337             pScissorsInFixedPoint[pViewportIndex[5]].ymin,
    338             pScissorsInFixedPoint[pViewportIndex[6]].ymin,
    339             pScissorsInFixedPoint[pViewportIndex[7]].ymin);
    340         scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
    341             pScissorsInFixedPoint[pViewportIndex[1]].xmax,
    342             pScissorsInFixedPoint[pViewportIndex[2]].xmax,
    343             pScissorsInFixedPoint[pViewportIndex[3]].xmax,
    344             pScissorsInFixedPoint[pViewportIndex[4]].xmax,
    345             pScissorsInFixedPoint[pViewportIndex[5]].xmax,
    346             pScissorsInFixedPoint[pViewportIndex[6]].xmax,
    347             pScissorsInFixedPoint[pViewportIndex[7]].xmax);
    348         scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
    349             pScissorsInFixedPoint[pViewportIndex[1]].ymax,
    350             pScissorsInFixedPoint[pViewportIndex[2]].ymax,
    351             pScissorsInFixedPoint[pViewportIndex[3]].ymax,
    352             pScissorsInFixedPoint[pViewportIndex[4]].ymax,
    353             pScissorsInFixedPoint[pViewportIndex[5]].ymax,
    354             pScissorsInFixedPoint[pViewportIndex[6]].ymax,
    355             pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    356     }
    357 };
    358 
// Function pointer type matching every ProcessAttributes<...> instantiation:
// (draw context, PA state, triIndex, primId, output attribute buffer).
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
    360 
    361 struct ProcessAttributesChooser
    362 {
    363     typedef PFN_PROCESS_ATTRIBUTES FuncType;
    364 
    365     template <typename... ArgsB>
    366     static FuncType GetFunc()
    367     {
    368         return ProcessAttributes<ArgsB...>;
    369     }
    370 };
    371 
    372 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
    373 {
    374     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
    375 }
    376 
    377 //////////////////////////////////////////////////////////////////////////
    378 /// @brief Processes enabled user clip distances. Loads the active clip
    379 ///        distances from the PA, sets up barycentric equations, and
    380 ///        stores the results to the output buffer
    381 /// @param pa - Primitive Assembly state
    382 /// @param primIndex - primitive index to process
    383 /// @param clipDistMask - mask of enabled clip distances
    384 /// @param pUserClipBuffer - buffer to store results
    385 template<uint32_t NumVerts>
    386 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
    387 {
    388     DWORD clipDist;
    389     while (_BitScanForward(&clipDist, clipDistMask))
    390     {
    391         clipDistMask &= ~(1 << clipDist);
    392         uint32_t clipSlot = clipDist >> 2;
    393         uint32_t clipComp = clipDist & 0x3;
    394         uint32_t clipAttribSlot = clipSlot == 0 ?
    395             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
    396 
    397         __m128 primClipDist[3];
    398         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
    399 
    400         float vertClipDist[NumVerts];
    401         for (uint32_t e = 0; e < NumVerts; ++e)
    402         {
    403             OSALIGNSIMD(float) aVertClipDist[4];
    404             _mm_store_ps(aVertClipDist, primClipDist[e]);
    405             vertClipDist[e] = aVertClipDist[clipComp];
    406         };
    407 
    408         // setup plane equations for barycentric interpolation in the backend
    409         float baryCoeff[NumVerts];
    410         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
    411         for (uint32_t e = 0; e < NumVerts - 1; ++e)
    412         {
    413             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
    414         }
    415         baryCoeff[NumVerts - 1] = last;
    416 
    417         for (uint32_t e = 0; e < NumVerts; ++e)
    418         {
    419             *(pUserClipBuffer++) = baryCoeff[e];
    420         }
    421     }
    422 }
    423 
    424 //////////////////////////////////////////////////////////////////////////
    425 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
    426 ///        culling, viewport transform, etc.
    427 /// @param pDC - pointer to draw context.
    428 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
    430 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
    431 /// @param primID - Primitive ID for each triangle.
    432 /// @param viewportIdx - viewport array index for each triangle.
    433 /// @tparam CT - ConservativeRastFETraits
    434 template <typename CT>
    435 void BinTriangles(
    436     DRAW_CONTEXT *pDC,
    437     PA_STATE& pa,
    438     uint32_t workerId,
    439     simdvector tri[3],
    440     uint32_t triMask,
    441     simdscalari primID,
    442     simdscalari viewportIdx)
    443 {
    444     SWR_CONTEXT *pContext = pDC->pContext;
    445 
    446     AR_BEGIN(FEBinTriangles, pDC->drawId);
    447 
    448     const API_STATE& state = GetApiState(pDC);
    449     const SWR_RASTSTATE& rastState = state.rastState;
    450     const SWR_FRONTEND_STATE& feState = state.frontendState;
    451     const SWR_GS_STATE& gsState = state.gsState;
    452     MacroTileMgr *pTileMgr = pDC->pTileMgr;
    453 
    454     simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    455     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    456     simdscalar vRecipW2 = _simd_set1_ps(1.0f);
    457 
    458     if (feState.vpTransformDisable)
    459     {
    460         // RHW is passed in directly when VP transform is disabled
    461         vRecipW0 = tri[0].v[3];
    462         vRecipW1 = tri[1].v[3];
    463         vRecipW2 = tri[2].v[3];
    464     }
    465     else
    466     {
    467         // Perspective divide
    468         vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
    469         vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
    470         vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
    471 
    472         tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
    473         tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
    474         tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
    475 
    476         tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
    477         tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
    478         tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
    479 
    480         tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
    481         tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
    482         tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
    483 
    484         // Viewport transform to screen space coords
    485         if (state.gsState.emitsViewportArrayIndex)
    486         {
    487             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
    488         }
    489         else
    490         {
    491             viewportTransform<3>(tri, state.vpMatrices);
    492         }
    493     }
    494 
    495     // Adjust for pixel center location
    496     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    497     tri[0].x = _simd_add_ps(tri[0].x, offset);
    498     tri[0].y = _simd_add_ps(tri[0].y, offset);
    499 
    500     tri[1].x = _simd_add_ps(tri[1].x, offset);
    501     tri[1].y = _simd_add_ps(tri[1].y, offset);
    502 
    503     tri[2].x = _simd_add_ps(tri[2].x, offset);
    504     tri[2].y = _simd_add_ps(tri[2].y, offset);
    505 
    506     simdscalari vXi[3], vYi[3];
    507     // Set vXi, vYi to required fixed point precision
    508     FPToFixedPoint(tri, vXi, vYi);
    509 
    510     // triangle setup
    511     simdscalari vAi[3], vBi[3];
    512     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
    513 
    514     // determinant
    515     simdscalari vDet[2];
    516     calcDeterminantIntVertical(vAi, vBi, vDet);
    517 
    518     // cull zero area
    519     int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    520     int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
    521 
    522     int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
    523 
    524     uint32_t origTriMask = triMask;
    525     // don't cull degenerate triangles if we're conservatively rasterizing
    526     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
    527     {
    528         triMask &= ~cullZeroAreaMask;
    529     }
    530 
    531     // determine front winding tris
    532     // CW  +det
    533     // CCW det < 0;
    534     // 0 area triangles are marked as backfacing regardless of winding order,
    535     // which is required behavior for conservative rast and wireframe rendering
    536     uint32_t frontWindingTris;
    537     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    538     {
    539         maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    540         maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    541     }
    542     else
    543     {
    544         maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
    545         maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
    546     }
    547     frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
    548 
    549     // cull
    550     uint32_t cullTris;
    551     switch ((SWR_CULLMODE)rastState.cullMode)
    552     {
    553     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    554     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    555     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    556         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    557     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    558     default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    559     }
    560 
    561     triMask &= ~cullTris;
    562 
    563     if (origTriMask ^ triMask)
    564     {
    565         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    566     }
    567 
    568     // Simple non-conformant wireframe mode, useful for debugging
    569     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
    570     {
    571         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
    572         simdvector line[2];
    573         simdscalar recipW[2];
    574         line[0] = tri[0];
    575         line[1] = tri[1];
    576         recipW[0] = vRecipW0;
    577         recipW[1] = vRecipW1;
    578         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
    579 
    580         line[0] = tri[1];
    581         line[1] = tri[2];
    582         recipW[0] = vRecipW1;
    583         recipW[1] = vRecipW2;
    584         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
    585 
    586         line[0] = tri[2];
    587         line[1] = tri[0];
    588         recipW[0] = vRecipW2;
    589         recipW[1] = vRecipW0;
    590         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
    591 
    592         AR_END(FEBinTriangles, 1);
    593         return;
    594     }
    595 
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    597     // compute per tri backface
    598     uint32_t frontFaceMask = frontWindingTris;
    599     uint32_t *pPrimID = (uint32_t *)&primID;
    600     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    601     DWORD triIndex = 0;
    602     // for center sample pattern, all samples are at pixel center; calculate coverage
    603     // once at center and broadcast the results in the backend
    604     const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    605     uint32_t edgeEnable;
    606     PFN_WORK_FUNC pfnWork;
    607     if (CT::IsConservativeT::value)
    608     {
    609         // determine which edges of the degenerate tri, if any, are valid to rasterize.
    610         // used to call the appropriate templated rasterizer function
    611         if (cullZeroAreaMask > 0)
    612         {
    613             // e0 = v1-v0
    614             simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
    615             simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
    616             uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
    617 
    618             // e1 = v2-v1
    619             simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
    620             simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
    621             uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
    622 
    623             // e2 = v0-v2
    624             // if v0 == v1 & v1 == v2, v0 == v2
    625             uint32_t e2Mask = e0Mask & e1Mask;
    626             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
    627 
    628             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
    629             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
    630             e0Mask = pdep_u32(e0Mask, 0x00249249);
    631             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
    632             e1Mask = pdep_u32(e1Mask, 0x00492492);
    633             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
    634             e2Mask = pdep_u32(e2Mask, 0x00924924);
    635 
    636             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
    637         }
    638         else
    639         {
    640             edgeEnable = 0x00FFFFFF;
    641         }
    642     }
    643     else
    644     {
    645         // degenerate triangles won't be sent to rasterizer; just enable all edges
    646         pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
    647             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
    648             (state.scissorsTileAligned == false));
    649     }
    650 
    651     if (!triMask)
    652     {
    653         goto endBinTriangles;
    654     }
    655 
    656     // Calc bounding box of triangles
    657     simdBBox bbox;
    658     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
    659 
    660     // determine if triangle falls between pixel centers and discard
    661     // only discard for non-MSAA case and when conservative rast is disabled
    662     // (xmin + 127) & ~255
    663     // (xmax + 128) & ~255
    664     if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    665     {
    666         origTriMask = triMask;
    667 
    668         int cullCenterMask;
    669         {
    670             simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
    671             xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
    672             simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
    673             xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
    674 
    675             simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
    676 
    677             simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
    678             ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
    679             simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
    680             ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
    681 
    682             simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
    683             vMaskV = _simd_or_si(vMaskH, vMaskV);
    684             cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
    685         }
    686 
    687         triMask &= ~cullCenterMask;
    688 
    689         if (origTriMask ^ triMask)
    690         {
    691             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    692         }
    693     }
    694 
    695     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    696     // Gather the AOS effective scissor rects based on the per-prim VP index.
    697     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
    698     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    699     if (state.gsState.emitsViewportArrayIndex)
    700     {
    701         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
    702             scisXmin, scisYmin, scisXmax, scisYmax);
    703     }
    704     else // broadcast fast path for non-VPAI case.
    705     {
    706         scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
    707         scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
    708         scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
    709         scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    710     }
    711 
    712     bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    713     bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    714     bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    715     bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
    716 
    717     if (CT::IsConservativeT::value)
    718     {
    719         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
    720         // some area. Bump the xmax/ymax edges out
    721         simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
    722         bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
    723         simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
    724         bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    725     }
    726 
    727     // Cull tris completely outside scissor
    728     {
    729         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
    730         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
    731         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
    732         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
    733         triMask = triMask & ~maskOutsideScissor;
    734     }
    735 
    736     if (!triMask)
    737     {
    738         goto endBinTriangles;
    739     }
    740 
    741     // Convert triangle bbox to macrotile units.
    742     bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    743     bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    744     bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    745     bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    746 
    747     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    748     _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    749     _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    750     _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    751     _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
    752 
    753     // transpose verts needed for backend
    754     /// @todo modify BE to take non-transformed verts
    755     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    756     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    757     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    758     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    759     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
    760 
    761     // store render target array index
    762     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    763     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    764     {
    765         simdvector vRtai[3];
    766         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
    767         simdscalari vRtaii;
    768         vRtaii = _simd_castps_si(vRtai[0].x);
    769         _simd_store_si((simdscalari*)aRTAI, vRtaii);
    770     }
    771     else
    772     {
    773         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    774     }
    775 
    776 endBinTriangles:
    777 
    778     // scan remaining valid triangles and bin each separately
    779     while (_BitScanForward(&triIndex, triMask))
    780     {
    781         uint32_t linkageCount = state.backendState.numAttributes;
    782         uint32_t numScalarAttribs = linkageCount * 4;
    783 
    784         BE_WORK work;
    785         work.type = DRAW;
    786 
    787         bool isDegenerate;
    788         if (CT::IsConservativeT::value)
    789         {
    790             // only rasterize valid edges if we have a degenerate primitive
    791             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
    792             work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
    793                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
    794                 (state.scissorsTileAligned == false));
    795 
    796             // Degenerate triangles are required to be constant interpolated
    797             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
    798         }
    799         else
    800         {
    801             isDegenerate = false;
    802             work.pfnWork = pfnWork;
    803         }
    804 
    805         // Select attribute processor
    806         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
    807             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
    808 
    809         TRIANGLE_WORK_DESC &desc = work.desc.tri;
    810 
    811         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
    812         desc.triFlags.primID = pPrimID[triIndex];
    813         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
    814         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
    815 
    816         auto pArena = pDC->pArena;
    817         SWR_ASSERT(pArena != nullptr);
    818 
    819         // store active attribs
    820         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
    821         desc.pAttribs = pAttribs;
    822         desc.numAttribs = linkageCount;
    823         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
    824 
    825         // store triangle vertex data
    826         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
    827 
    828         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
    829         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
    830         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
    831         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
    832 
    833         // store user clip distances
    834         if (rastState.clipDistanceMask)
    835         {
    836             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
    837             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
    838             ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
    839         }
    840 
    841         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
    842         {
    843             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
    844             {
    845 #if KNOB_ENABLE_TOSS_POINTS
    846                 if (!KNOB_TOSS_SETUP_TRIS)
    847 #endif
    848                 {
    849                     pTileMgr->enqueue(x, y, &work);
    850                 }
    851             }
    852         }
    853                      triMask &= ~(1 << triIndex);
    854     }
    855 
    856     AR_END(FEBinTriangles, 1);
    857 }
    858 
    859 struct FEBinTrianglesChooser
    860 {
            // Function-pointer type produced by this chooser.
    861     using FuncType = PFN_PROCESS_PRIMS;
    862 
            // Returns the BinTriangles instantiation whose traits type is
            // ConservativeRastFETraits<TraitArgs...>.
    863     template <typename... TraitArgs>
    864     static FuncType GetFunc()
    865     {
    866         return &BinTriangles<ConservativeRastFETraits<TraitArgs...>>;
    867     }
    868 };
    869 
    870 // Selector for the correct templated BinTriangles function.
        // Forwards the runtime IsConservative flag to TemplateArgUnroller, which
        // returns a function obtained through FEBinTrianglesChooser::GetFunc.
        // NOTE(review): presumably the unroller turns the bool into a compile-time
        // template argument for ConservativeRastFETraits - confirm in its definition.
    871 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
    872 {
    873     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
    874 }
    875 
    876 
    877 //////////////////////////////////////////////////////////////////////////
    878 /// @brief Bin a SIMD's worth of points to the backend.
        ///        Points for which CanUseSimplePoints() is true are each enqueued to
        ///        a single macrotile with RasterizeSimplePoint. All other points are
        ///        bloated to a bounding box by point size, intersected with the
        ///        scissor, and enqueued with RasterizeTriPoint to every macrotile
        ///        the box overlaps.
    879 /// @param pDC - pointer to draw context.
    880 /// @param pa - The primitive assembly object.
    881 /// @param workerId - thread's worker id. Every thread has a unique id.
        ///        Not referenced by name inside this function.
    882 /// @param prim - Contains point position data for a SIMD's worth of points.
        ///        Only prim[0] is used; its x/y (and, when the viewport transform
        ///        is enabled, z) are modified in place.
        /// @param primMask - Bitmask of lanes to bin; bit i corresponds to SIMD lane i.
    883 /// @param primID - Primitive ID for each point.
        /// @param viewportIdx - Viewport array index for each point.
    884 void BinPoints(
    885     DRAW_CONTEXT *pDC,
    886     PA_STATE& pa,
    887     uint32_t workerId,
    888     simdvector prim[3],
    889     uint32_t primMask,
    890     simdscalari primID,
    891     simdscalari viewportIdx)
    892 {
            // NOTE(review): pContext is not referenced by name below; presumably it
            // is consumed by the AR_BEGIN/AR_END macros - confirm before removing.
    893     SWR_CONTEXT *pContext = pDC->pContext;
    894 
    895     AR_BEGIN(FEBinPoints, pDC->drawId);
    896 
            // a point has a single vertex; alias it (writes go through to prim[0])
    897     simdvector& primVerts = prim[0];
    898 
    899     const API_STATE& state = GetApiState(pDC);
    900     const SWR_FRONTEND_STATE& feState = state.frontendState;
    901     const SWR_GS_STATE& gsState = state.gsState;
    902     const SWR_RASTSTATE& rastState = state.rastState;
            // view the SIMD register as per-lane uint32 values for scalar indexing
    903     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    904 
    905     // Select attribute processor (1 vertex per primitive)
    906     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
    907         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
    908 
    909     if (!feState.vpTransformDisable)
    910     {
    911         // perspective divide
    912         simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
    913         primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
    914         primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
    915         primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
    916 
    917         // viewport transform to screen coords
                // (per-lane viewport when the GS emits a viewport array index)
    918         if (state.gsState.emitsViewportArrayIndex)
    919         {
    920             viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
    921         }
    922         else
    923         {
    924             viewportTransform<1>(&primVerts, state.vpMatrices);
    925         }
    926     }
    927 
    928     // adjust for pixel center location
    929     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    930     primVerts.x = _simd_add_ps(primVerts.x, offset);
    931     primVerts.y = _simd_add_ps(primVerts.y, offset);
    932 
    933     // convert to fixed point
    934     simdscalari vXi, vYi;
    935     vXi = fpToFixedPointVertical(primVerts.x);
    936     vYi = fpToFixedPointVertical(primVerts.y);
    937 
            // Fast path: each point is enqueued to exactly one macrotile.
            // NOTE(review): desc.pUserClipBuffer is never written on this path;
            // presumably CanUseSimplePoints() rules out user clip distances - confirm.
    938     if (CanUseSimplePoints(pDC))
    939     {
    940         // adjust for ymin-xmin rule
    941         vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
    942         vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
    943 
    944         // cull points off the ymin-xmin edge of the viewport
                // (movemask gathers each lane's sign bit, so lanes whose fixed-point
                // x or y is negative are cleared from primMask)
    945         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
    946         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
    947 
    948         // compute macro tile coordinates
    949         simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    950         simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    951 
                // spill per-lane values to arrays so the scalar binning loop can index them
    952         OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
    953         _simd_store_si((simdscalari*)aMacroX, macroX);
    954         _simd_store_si((simdscalari*)aMacroY, macroY);
    955 
    956         // compute raster tile coordinates
    957         simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    958         simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    959 
    960         // compute raster tile relative x,y for coverage mask
    961         simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
    962         simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
    963 
                // integer pixel position minus the tile origin = offset inside the raster tile
    964         simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
    965         simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
    966 
    967         OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
    968         OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
    969         _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
    970         _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
    971 
    972         OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
    973         OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
    974         _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
    975         _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
    976 
    977         OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
    978         _simd_store_ps((float*)aZ, primVerts.z);
    979 
    980         // store render target array index
                // (taken from the assembled VERTEX_RTAI_SLOT when the GS emits it, else 0)
    981         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    982         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    983         {
    984             simdvector vRtai;
    985             pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
    986             simdscalari vRtaii = _simd_castps_si(vRtai.x);
    987             _simd_store_si((simdscalari*)aRTAI, vRtaii);
    988         }
    989         else
    990         {
    991             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    992         }
    993 
    994         uint32_t *pPrimID = (uint32_t *)&primID;
    995         DWORD primIndex = 0;
    996 
    997         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    998 
   999         // scan remaining valid points and bin each separately
   1000         while (_BitScanForward(&primIndex, primMask))
   1001         {
   1002             uint32_t linkageCount = backendState.numAttributes;
   1003             uint32_t numScalarAttribs = linkageCount * 4;
   1004 
   1005             BE_WORK work;
   1006             work.type = DRAW;
   1007 
   1008             TRIANGLE_WORK_DESC &desc = work.desc.tri;
   1009 
   1010             // points are always front facing
   1011             desc.triFlags.frontFacing = 1;
   1012             desc.triFlags.primID = pPrimID[primIndex];
   1013             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
   1014             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
   1015 
   1016             work.pfnWork = RasterizeSimplePoint;
   1017 
   1018             auto pArena = pDC->pArena;
   1019             SWR_ASSERT(pArena != nullptr);
   1020 
   1021             // store attributes
                    // (buffer is sized as 3 * numScalarAttribs floats)
   1022             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
   1023             desc.pAttribs = pAttribs;
   1024             desc.numAttribs = linkageCount;
   1025 
   1026             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
   1027 
   1028             // store raster tile aligned x, y, perspective correct z
                    // (x and y are written as raw uint32 bit patterns into the float
                    // buffer; only z is stored as a float)
   1029             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
   1030             desc.pTriBuffer = pTriBuffer;
   1031             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
   1032             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
   1033             *pTriBuffer = aZ[primIndex];
   1034 
   1035             uint32_t tX = aTileRelativeX[primIndex];
   1036             uint32_t tY = aTileRelativeY[primIndex];
   1037 
   1038             // pack the relative x,y into the coverageMask, the rasterizer will
   1039             // generate the true coverage mask from it
                    // (tX in the low bits, tY shifted up by 4; assumes tX < 16 - TODO
                    // confirm against the raster tile dimensions)
   1040             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
   1041 
   1042             // bin it
   1043             MacroTileMgr *pTileMgr = pDC->pTileMgr;
   1044 #if KNOB_ENABLE_TOSS_POINTS
   1045             if (!KNOB_TOSS_SETUP_TRIS)
   1046 #endif
   1047             {
   1048                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
   1049             }
                    // clear this lane so the next _BitScanForward finds the following point
   1050             primMask &= ~(1 << primIndex);
   1051         }
   1052     }
   1053     else
   1054     {
   1055         // non simple points need to be potentially binned to multiple macro tiles
                // point size is per-vertex (VERTEX_POINT_SIZE_SLOT) when
                // rastState.pointParam is set, otherwise the rastState.pointSize constant
   1056         simdscalar vPointSize;
   1057         if (rastState.pointParam)
   1058         {
   1059             simdvector size[3];
   1060             pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
   1061             vPointSize = size[0].x;
   1062         }
   1063         else
   1064         {
   1065             vPointSize = _simd_set1_ps(rastState.pointSize);
   1066         }
   1067 
   1068         // bloat point to bbox
   1069         simdBBox bbox;
   1070         bbox.xmin = bbox.xmax = vXi;
   1071         bbox.ymin = bbox.ymax = vYi;
   1072 
                // grow the box by half the point size (in fixed point) on every side
   1073         simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
   1074         simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
   1075         bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
   1076         bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
   1077         bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
   1078         bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
   1079 
   1080         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
   1081         // Gather the AOS effective scissor rects based on the per-prim VP index.
   1082         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
   1083         simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
   1084         if (state.gsState.emitsViewportArrayIndex)
   1085         {
   1086             GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
   1087                 scisXmin, scisYmin, scisXmax, scisYmax);
   1088         }
   1089         else // broadcast fast path for non-VPAI case.
   1090         {
   1091             scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
   1092             scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
   1093             scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
   1094             scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
   1095         }
   1096 
   1097         bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
   1098         bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
   1099         bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
   1100         bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
   1101 
   1102         // Cull bloated points completely outside scissor
                // (after clamping, min > max on either axis means an empty intersection)
   1103         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
   1104         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
   1105         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
   1106         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
   1107         primMask = primMask & ~maskOutsideScissor;
   1108 
   1109         // Convert bbox to macrotile units.
   1110         bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
   1111         bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
   1112         bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
   1113         bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
   1114 
                // spill the per-lane macrotile extents for the scalar binning loop
   1115         OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
   1116         _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
   1117         _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
   1118         _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
   1119         _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
   1120 
   1121         // store render target array index
   1122         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
   1123         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
   1124         {
   1125             simdvector vRtai[2];
   1126             pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
   1127             simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
   1128             _simd_store_si((simdscalari*)aRTAI, vRtaii);
   1129         }
   1130         else
   1131         {
   1132             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
   1133         }
   1134 
   1135         OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
   1136         _simd_store_ps((float*)aPointSize, vPointSize);
   1137 
   1138         uint32_t *pPrimID = (uint32_t *)&primID;
   1139 
   1140         OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
   1141         OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
   1142         OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
   1143 
   1144         _simd_store_ps((float*)aPrimVertsX, primVerts.x);
   1145         _simd_store_ps((float*)aPrimVertsY, primVerts.y);
   1146         _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
   1147 
   1148         // scan remaining valid prims and bin each separately
   1149         const SWR_BACKEND_STATE& backendState = state.backendState;
   1150         DWORD primIndex;
   1151         while (_BitScanForward(&primIndex, primMask))
   1152         {
   1153             uint32_t linkageCount = backendState.numAttributes;
   1154             uint32_t numScalarAttribs = linkageCount * 4;
   1155 
   1156             BE_WORK work;
   1157             work.type = DRAW;
   1158 
   1159             TRIANGLE_WORK_DESC &desc = work.desc.tri;
   1160 
                    // same flags as the simple path, plus the per-point size
   1161             desc.triFlags.frontFacing = 1;
   1162             desc.triFlags.primID = pPrimID[primIndex];
   1163             desc.triFlags.pointSize = aPointSize[primIndex];
   1164             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
   1165             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
   1166 
   1167             work.pfnWork = RasterizeTriPoint;
   1168 
   1169             auto pArena = pDC->pArena;
   1170             SWR_ASSERT(pArena != nullptr);
   1171 
   1172             // store active attribs
   1173             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
   1174             desc.numAttribs = linkageCount;
   1175             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
   1176 
   1177             // store point vertex data
                    // (screen-space x, y, z as floats; the fourth float is left unwritten)
   1178             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
   1179             desc.pTriBuffer = pTriBuffer;
   1180             *pTriBuffer++ = aPrimVertsX[primIndex];
   1181             *pTriBuffer++ = aPrimVertsY[primIndex];
   1182             *pTriBuffer = aPrimVertsZ[primIndex];
   1183 
   1184             // store user clip distances
                    // Each enabled distance occupies three floats: {0, 0, distance}.
                    // NOTE(review): presumably this matches the three-values-per-distance
                    // layout the backend interpolates for triangles - confirm.
                    // NOTE(review): dists[8] assumes clipDistanceMask has at most 8 bits set.
   1185             if (rastState.clipDistanceMask)
   1186             {
   1187                 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
   1188                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
   1189                 float dists[8];
   1190                 float one = 1.0f;
   1191                 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
   1192                 for (uint32_t i = 0; i < numClipDist; i++) {
   1193                     desc.pUserClipBuffer[3*i + 0] = 0.0f;
   1194                     desc.pUserClipBuffer[3*i + 1] = 0.0f;
   1195                     desc.pUserClipBuffer[3*i + 2] = dists[i];
   1196                 }
   1197             }
   1198 
                    // enqueue the same work item to every macrotile the clamped bbox touches
   1199             MacroTileMgr *pTileMgr = pDC->pTileMgr;
   1200             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
   1201             {
   1202                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
   1203                 {
   1204 #if KNOB_ENABLE_TOSS_POINTS
   1205                     if (!KNOB_TOSS_SETUP_TRIS)
   1206 #endif
   1207                     {
   1208                         pTileMgr->enqueue(x, y, &work);
   1209                     }
   1210                 }
   1211             }
   1212 
                    // clear this lane so the next _BitScanForward finds the following point
   1213             primMask &= ~(1 << primIndex);
   1214         }
   1215     }
   1216 
   1217     AR_END(FEBinPoints, 1);
   1218 }
   1219 
   1220 //////////////////////////////////////////////////////////////////////////
   1221 /// @brief Bin SIMD lines to the backend.
   1222 /// @param pDC - pointer to draw context.
   1223 /// @param pa - The primitive assembly object.
   1224 /// @param workerId - thread's worker id. Every thread has a unique id.
   1225 /// @param prim - Contains line position data for a SIMD's worth of lines.
   1226 /// @param primID - Primitive ID for each line.
   1227 /// @param viewportIdx - Viewport Array Index for each line.
   1228 void BinPostSetupLines(
   1229     DRAW_CONTEXT *pDC,
   1230     PA_STATE& pa,
   1231     uint32_t workerId,
   1232     simdvector prim[],
   1233     simdscalar recipW[],
   1234     uint32_t primMask,
   1235     simdscalari primID,
   1236     simdscalari viewportIdx)
   1237 {
   1238     SWR_CONTEXT *pContext = pDC->pContext;
   1239 
   1240     AR_BEGIN(FEBinLines, pDC->drawId);
   1241 
   1242     const API_STATE& state = GetApiState(pDC);
   1243     const SWR_RASTSTATE& rastState = state.rastState;
   1244     const SWR_FRONTEND_STATE& feState = state.frontendState;
   1245     const SWR_GS_STATE& gsState = state.gsState;
   1246 
   1247     // Select attribute processor
   1248     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
   1249         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
   1250 
   1251     simdscalar& vRecipW0 = recipW[0];
   1252     simdscalar& vRecipW1 = recipW[1];
   1253 
   1254     // convert to fixed point
   1255     simdscalari vXi[2], vYi[2];
   1256     vXi[0] = fpToFixedPointVertical(prim[0].x);
   1257     vYi[0] = fpToFixedPointVertical(prim[0].y);
   1258     vXi[1] = fpToFixedPointVertical(prim[1].x);
   1259     vYi[1] = fpToFixedPointVertical(prim[1].y);
   1260 
   1261     // compute x-major vs y-major mask
   1262     simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
   1263     simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
   1264     simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
   1265     uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
   1266 
   1267     // cull zero-length lines
   1268     simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
   1269     vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
   1270 
   1271     primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
   1272 
   1273     uint32_t *pPrimID = (uint32_t *)&primID;
   1274     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
   1275 
   1276     simdscalar vUnused = _simd_setzero_ps();
   1277 
   1278     // Calc bounding box of lines
   1279     simdBBox bbox;
   1280     bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
   1281     bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
   1282     bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
   1283     bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
   1284 
   1285     // bloat bbox by line width along minor axis
   1286     simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
   1287     simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
   1288     simdBBox bloatBox;
   1289     bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
   1290     bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
   1291     bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
   1292     bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
   1293 
   1294     bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
   1295     bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
   1296     bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
   1297     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
   1298 
   1299     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
   1300     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
   1301     if (state.gsState.emitsViewportArrayIndex)
   1302     {
   1303         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
   1304             scisXmin, scisYmin, scisXmax, scisYmax);
   1305     }
   1306     else // broadcast fast path for non-VPAI case.
   1307     {
   1308         scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
   1309         scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
   1310         scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
   1311         scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
   1312     }
   1313 
   1314     bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
   1315     bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
   1316     bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
   1317     bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
   1318 
   1319     // Cull prims completely outside scissor
   1320     {
   1321         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
   1322         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
   1323         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
   1324         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
   1325         primMask = primMask & ~maskOutsideScissor;
   1326     }
   1327 
   1328     if (!primMask)
   1329     {
   1330         goto endBinLines;
   1331     }
   1332 
   1333     // Convert triangle bbox to macrotile units.
   1334     bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
   1335     bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
   1336     bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
   1337     bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
   1338 
   1339     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
   1340     _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
   1341     _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
   1342     _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
   1343     _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
   1344 
   1345     // transpose verts needed for backend
   1346     /// @todo modify BE to take non-transformed verts
   1347     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
   1348     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
   1349     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
   1350     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
   1351     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
   1352 
   1353     // store render target array index
   1354     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
   1355     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
   1356     {
   1357         simdvector vRtai[2];
   1358         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
   1359         simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
   1360         _simd_store_si((simdscalari*)aRTAI, vRtaii);
   1361     }
   1362     else
   1363     {
   1364         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
   1365     }
   1366 
   1367     // scan remaining valid prims and bin each separately
   1368     DWORD primIndex;
   1369     while (_BitScanForward(&primIndex, primMask))
   1370     {
   1371         uint32_t linkageCount = state.backendState.numAttributes;
   1372         uint32_t numScalarAttribs = linkageCount * 4;
   1373 
   1374         BE_WORK work;
   1375         work.type = DRAW;
   1376 
   1377         TRIANGLE_WORK_DESC &desc = work.desc.tri;
   1378 
   1379         desc.triFlags.frontFacing = 1;
   1380         desc.triFlags.primID = pPrimID[primIndex];
   1381         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
   1382         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
   1383         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
   1384 
   1385         work.pfnWork = RasterizeLine;
   1386 
   1387         auto pArena = pDC->pArena;
   1388         SWR_ASSERT(pArena != nullptr);
   1389 
   1390         // store active attribs
   1391         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
   1392         desc.numAttribs = linkageCount;
   1393         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
   1394 
   1395         // store line vertex data
   1396         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
   1397         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
   1398         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
   1399         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
   1400         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
   1401 
   1402         // store user clip distances
   1403         if (rastState.clipDistanceMask)
   1404         {
   1405             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
   1406             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
   1407             ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
   1408         }
   1409 
   1410         MacroTileMgr *pTileMgr = pDC->pTileMgr;
   1411         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
   1412         {
   1413             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
   1414             {
   1415 #if KNOB_ENABLE_TOSS_POINTS
   1416                 if (!KNOB_TOSS_SETUP_TRIS)
   1417 #endif
   1418                 {
   1419                     pTileMgr->enqueue(x, y, &work);
   1420                 }
   1421             }
   1422         }
   1423 
   1424         primMask &= ~(1 << primIndex);
   1425     }
   1426 
   1427 endBinLines:
   1428 
   1429     AR_END(FEBinLines, 1);
   1430 }
   1431 
   1432 //////////////////////////////////////////////////////////////////////////
   1433 /// @brief Bin SIMD lines to the backend.
   1434 /// @param pDC - pointer to draw context.
   1435 /// @param pa - The primitive assembly object.
   1436 /// @param workerId - thread's worker id. Even thread has a unique id.
   1437 /// @param tri - Contains line position data for SIMDs worth of points.
   1438 /// @param primID - Primitive ID for each line.
   1439 /// @param viewportIdx - Viewport Array Index for each line.
   1440 void BinLines(
   1441     DRAW_CONTEXT *pDC,
   1442     PA_STATE& pa,
   1443     uint32_t workerId,
   1444     simdvector prim[],
   1445     uint32_t primMask,
   1446     simdscalari primID,
   1447     simdscalari viewportIdx)
   1448 {
   1449     SWR_CONTEXT *pContext = pDC->pContext;
   1450 
   1451     const API_STATE& state = GetApiState(pDC);
   1452     const SWR_RASTSTATE& rastState = state.rastState;
   1453     const SWR_FRONTEND_STATE& feState = state.frontendState;
   1454     const SWR_GS_STATE& gsState = state.gsState;
   1455 
   1456     // Select attribute processor
   1457     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
   1458         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
   1459 
   1460     simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
   1461 
   1462     if (!feState.vpTransformDisable)
   1463     {
   1464         // perspective divide
   1465         vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
   1466         vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
   1467 
   1468         prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
   1469         prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
   1470 
   1471         prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
   1472         prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
   1473 
   1474         prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
   1475         prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
   1476 
   1477         // viewport transform to screen coords
   1478         if (state.gsState.emitsViewportArrayIndex)
   1479         {
   1480             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
   1481         }
   1482         else
   1483         {
   1484             viewportTransform<2>(prim, state.vpMatrices);
   1485         }
   1486     }
   1487 
   1488     // adjust for pixel center location
   1489     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
   1490     prim[0].x = _simd_add_ps(prim[0].x, offset);
   1491     prim[0].y = _simd_add_ps(prim[0].y, offset);
   1492 
   1493     prim[1].x = _simd_add_ps(prim[1].x, offset);
   1494     prim[1].y = _simd_add_ps(prim[1].y, offset);
   1495 
   1496     BinPostSetupLines(
   1497         pDC,
   1498         pa,
   1499         workerId,
   1500         prim,
   1501         vRecipW,
   1502         primMask,
   1503         primID,
   1504         viewportIdx);
   1505 }
   1506