Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file clip.h
     24 *
     25 * @brief Definitions for clipping
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "common/simdintrin.h"
     31 #include "core/context.h"
     32 #include "core/pa.h"
     33 #include "rdtsc_core.h"
     34 
     35 // Temp storage used by the clipper
     // Declared here only; the definitions live outside this header.
     // Each array holds 7 SIMD vertices, the same count ClipSimd uses for its local vertex
     // store ("maximum 7 verts generated per triangle"). ClipHelper<>::GetTempVertices()
     // returns these arrays.
     // NOTE(review): THREAD is presumably a thread-local storage qualifier -- confirm against its definition.
     36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
     37 #if USE_SIMD16_FRONTEND
     38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
     39 #endif
     40 
     // Per-vertex clip code flags.
     // ComputeClipCodes builds these with bitwise and/or on float registers (via castsi_ps),
     // so every value here ends up as the bit pattern of a float lane.
     // With CLIPCODE_SHIFT == 23 the frustum and NEGW flags occupy bits 23..29, and the shared
     // guardband flag (0x80 << 23) is bit 30. Each GUARDBAND_* value additionally sets one low
     // bit (0x1..0x8) that identifies the plane.
     41 enum SWR_CLIPCODES
     42 {
     43     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
     44     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
     45 #define CLIPCODE_SHIFT 23
     46     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
     47     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
     48     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
     49     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
     50 
     51     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
     52     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
     53 
     54     NEGW            = (0x40 << CLIPCODE_SHIFT), // set by ComputeClipCodes when w <= 0
     55 
     56     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
     57     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
     58     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
     59     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
     60 };
     61 
     // Clip codes that send a primitive through the clipper: near/far, NEGW and the four
     // guardband planes. The FRUSTUM_LEFT/TOP/RIGHT/BOTTOM codes are not part of this mask.
     // Clipper::ComputeClipMask ANDs the union of the vertex clip codes with it.
     62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
     63 
     // Computes the clip codes (SWR_CLIPCODES bits) for one SIMD-wide vertex position.
     //
     // state            - API state; reads rastState.depthClipEnable, rastState.clipHalfZ and the
     //                    gbState.left/top/right/bottom arrays.
     // vertex           - position whose x/y/z components are tested against +/-w.
     // clipCodes        - [out] per-lane OR of every code whose test passed, kept as float bit patterns.
     // viewportIndexes  - per-lane index used to gather the guardband values from state.gbState.
     //
     // FRUSTUM_NEAR and FRUSTUM_FAR are only evaluated when depth clipping is enabled; all other
     // codes are always evaluated.
     64 template<typename SIMD_T>
     65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
     66 {
     67     clipCodes = SIMD_T::setzero_ps(); // NOTE(review): looks redundant -- the FRUSTUM_LEFT assignment below overwrites clipCodes without reading it
     68 
     69     // -w
     70     typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
     71 
     72     // FRUSTUM_LEFT: x < -w
     73     typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
     74     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
     75 
     76     // FRUSTUM_TOP: y < -w
     77     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
     78     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
     79 
     80     // FRUSTUM_RIGHT: x > w
     81     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
     82     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
     83 
     84     // FRUSTUM_BOTTOM: y > w
     85     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
     86     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
     87 
     88     if (state.rastState.depthClipEnable)
     89     {
     90         // FRUSTUM_NEAR
     91         // DX clips depth [0..w], GL clips [-w..w]
     92         if (state.rastState.clipHalfZ)
     93         {
     94             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
     95         }
     96         else
     97         {
     98             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
     99         }
    100         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
    101 
    102         // FRUSTUM_FAR: z > w
    103         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
    104         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
    105     }
    106 
    107     // NEGW: w <= 0
    108     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
    109     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
    110 
    111     // GUARDBAND_LEFT: x < -w * gbState.left[viewport index]; the gather uses scale factor 4 (presumably sizeof(float))
    112     typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
    113     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
    114     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
    115 
    116     // GUARDBAND_TOP: y < -w * gbState.top[viewport index]
    117     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
    118     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
    119     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
    120 
    121     // GUARDBAND_RIGHT: x > w * gbState.right[viewport index]
    122     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
    123     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
    124     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
    125 
    126     // GUARDBAND_BOTTOM: y > w * gbState.bottom[viewport index]
    127     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
    128     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
    129     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
    130 }
    131 
    // Primary template, left empty on purpose: only the explicit specializations
    // (SIMD256, and SIMD512 when USE_SIMD16_FRONTEND is set) provide members.
    132 template<typename SIMD_T>
    133 struct BinnerChooser
    134 {
    135 };
    136 
    // Picks the SIMD256 binning function for a primitive type and stores it in pfnBinFunc.
    //
    // Two ways to choose:
    //  - by vertex count: 3 -> triangle binner, 2 -> BinLines, anything else asserts and
    //    leaves pfnBinFunc as nullptr;
    //  - by topology: point list -> BinPoints, the listed line topologies -> BinLines,
    //    every other topology -> triangle binner.
    // The triangle binner comes from GetBinTrianglesFunc(conservativeRast > 0).
    137 template<>
    138 struct BinnerChooser<SIMD256>
    139 {
    140     PFN_PROCESS_PRIMS pfnBinFunc;
    141 
    142     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
    143         :pfnBinFunc(nullptr)
    144     {
    145         if (numVertsPerPrim == 3)
    146         {
    147             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
    148 
    149         }
    150         else if (numVertsPerPrim == 2)
    151         {
    152             pfnBinFunc = BinLines;
    153         }
    154         else
    155         {
    156             SWR_ASSERT(0 && "Unexpected points in clipper.");
    157         }
    158     }
    159 
    160     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
    161         :pfnBinFunc(nullptr)
    162     {
    163         switch (topology)
    164         {
    165         case TOP_POINT_LIST:
    166             pfnBinFunc = BinPoints;
    167             break;
    168         case TOP_LINE_LIST:
    169         case TOP_LINE_STRIP:
    170         case TOP_LINE_LOOP:
    171         case TOP_LINE_LIST_ADJ:
    172         case TOP_LISTSTRIP_ADJ:
    173             pfnBinFunc = BinLines;
    174             break;
    175         default:
    176             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
    177             break;
    178         };
    179     }
    180 
    181     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx) // asserts a binner was chosen, then forwards every argument to it
    182     {
    183         SWR_ASSERT(pfnBinFunc != nullptr);
    184 
    185         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
    186     }
    187 };
    188 
    189 #if USE_SIMD16_FRONTEND
    // SIMD512 counterpart of BinnerChooser<SIMD256>; only compiled when USE_SIMD16_FRONTEND is set.
    // Same selection rules, but it stores the *_simd16 binning functions:
    //  - by vertex count: 3 -> triangle binner, 2 -> BinLines_simd16, anything else asserts and
    //    leaves pfnBinFunc as nullptr;
    //  - by topology: point list -> BinPoints_simd16, the listed line topologies -> BinLines_simd16,
    //    every other topology -> triangle binner.
    // The triangle binner comes from GetBinTrianglesFunc_simd16(conservativeRast > 0).
    190 template<>
    191 struct BinnerChooser<SIMD512>
    192 {
    193     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
    194 
    195     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
    196         :pfnBinFunc(nullptr)
    197     {
    198         if (numVertsPerPrim == 3)
    199         {
    200             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
    201 
    202         }
    203         else if (numVertsPerPrim == 2)
    204         {
    205             pfnBinFunc = BinLines_simd16;
    206         }
    207         else
    208         {
    209             SWR_ASSERT(0 && "Unexpected points in clipper.");
    210         }
    211     }
    212 
    213     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
    214         :pfnBinFunc(nullptr)
    215     {
    216         switch (topology)
    217         {
    218         case TOP_POINT_LIST:
    219             pfnBinFunc = BinPoints_simd16;
    220             break;
    221         case TOP_LINE_LIST:
    222         case TOP_LINE_STRIP:
    223         case TOP_LINE_LOOP:
    224         case TOP_LINE_LIST_ADJ:
    225         case TOP_LISTSTRIP_ADJ:
    226             pfnBinFunc = BinLines_simd16;
    227             break;
    228         default:
    229             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
    230             break;
    231         };
    232     }
    233 
    234     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx) // asserts a binner was chosen, then forwards every argument to it
    235     {
    236         SWR_ASSERT(pfnBinFunc != nullptr);
    237 
    238         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
    239     }
    240 };
    241 
    242 #endif
    // Primary template, left empty on purpose: the per-width helpers live in the
    // SimdHelper<SIMD256> / SimdHelper<SIMD512> specializations.
    243 template<typename SIMD_T>
    244 struct SimdHelper
    245 {
    246 };
    247 
    // SIMD256 flavour of the width-dependent helpers used by the clipper.
    248 template<>
    249 struct SimdHelper<SIMD256>
    250 {
    251     static SIMD256::Float insert_lo_ps(SIMD256::Float a) // already SIMD256 wide: returned unchanged
    252     {
    253         return a;
    254     }
    255 
    256     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b) // movemask of the per-lane a == b compare
    257     {
    258         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
    259     }
    260 };
    261 
    262 #if USE_SIMD16_FRONTEND
    // SIMD512 flavour of the width-dependent helpers; only compiled when USE_SIMD16_FRONTEND is set.
    263 template<>
    264 struct SimdHelper<SIMD512>
    265 {
    266     static SIMD512::Float insert_lo_ps(SIMD256::Float a) // widens a SIMD256 value: inserts it at index 0 of a zeroed SIMD512 value (presumably the low half -- confirm)
    267     {
    268         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
    269     }
    270 
    271     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b) // per-lane a == b compare returned directly as a mask
    272     {
    273         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b); // NOTE(review): uses SIMD16:: here while the rest of this struct uses SIMD512:: -- presumably aliases, confirm
    274     }
    275 };
    276 
    277 #endif
    278 // Temp storage used by the clipper
    // Primary template, left empty on purpose: the specializations expose the
    // temp vertex array that matches each SIMD width.
    279 template<typename SIMD_T>
    280 struct ClipHelper
    281 {
    282 };
    283 
    // SIMD256 accessor for the clipper's temp vertex storage.
    284 template<>
    285 struct ClipHelper<SIMD256>
    286 {
    287     static SIMDVERTEX_T<SIMD256> *GetTempVertices() // returns the tlsTempVertices array declared near the top of this file
    288     {
    289         return tlsTempVertices;
    290     }
    291 };
    292 
    293 #if USE_SIMD16_FRONTEND
    // SIMD512 accessor for the clipper's temp vertex storage; only compiled when USE_SIMD16_FRONTEND is set.
    294 template<>
    295 struct ClipHelper<SIMD512>
    296 {
    297     static SIMDVERTEX_T<SIMD512> *GetTempVertices() // returns the tlsTempVertices_simd16 array declared near the top of this file
    298     {
    299         return tlsTempVertices_simd16;
    300     }
    301 };
    302 
    303 #endif
    304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
    305 class Clipper
    306 {
    307 public:
    // Stores the worker id and draw context and caches the API state obtained from
    // GetApiState(in_pDC). NumVertsPerPrim must be 1, 2 or 3 (checked at compile time).
    308     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
    309         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
    310     {
    311         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
    312     }
    313 
    // Fills the clipCodes[] member: one call to the free function ::ComputeClipCodes per
    // vertex of the primitive (NumVertsPerPrim entries of vertex[] are read).
    314     void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
    315     {
    316         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    317         {
    318             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
    319         }
    320     }
    321 
    // Returns the bitwise AND of the clip codes of all vertices: a bit survives only if
    // every vertex of the primitive has it set. Reads clipCodes[], so ComputeClipCodes
    // must have run first.
    322     typename SIMD_T::Float ComputeClipCodeIntersection()
    323     {
    324         typename SIMD_T::Float result = clipCodes[0];
    325 
    326         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    327         {
    328             result = SIMD_T::and_ps(result, clipCodes[i]);
    329         }
    330 
    331         return result;
    332     }
    333 
    // Returns the bitwise OR of the clip codes of all vertices: a bit is set if any
    // vertex of the primitive has it set. Reads clipCodes[], so ComputeClipCodes
    // must have run first.
    334     typename SIMD_T::Float ComputeClipCodeUnion()
    335     {
    336         typename SIMD_T::Float result = clipCodes[0];
    337 
    338         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    339         {
    340             result = SIMD_T::or_ps(result, clipCodes[i]);
    341         }
    342 
    343         return result;
    344     }
    345 
    // Returns a per-lane bitmask of primitives that need clipping: lanes where the union
    // of the vertex clip codes, restricted to GUARDBAND_CLIP_MASK, compares not-equal to zero.
    346     int ComputeClipMask()
    347     {
    348         typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
    349 
    350         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
    351 
    352         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps())); // float compare on the code bits
    353     }
    354 
    355     // clipper is responsible for culling any prims with NAN coordinates
    // Returns a per-lane bitmask of primitives that have a NaN in any position component.
    // Components are tested in pairs with an unordered compare (v[0] vs v[1], v[2] vs v[3]),
    // which is true when either operand is NaN.
    356     int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
    357     {
    358         typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
    359 
    360         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    361         {
    362             typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
    363             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
    364 
    365             typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
    366             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
    367         }
    368 
    369         return SIMD_T::movemask_ps(vNanMask);
    370     }
    371 
    // Returns a per-lane bitmask of primitives to discard because of user cull/clip distances.
    //
    // A lane is set when either:
    //  - for some enabled cull distance, every vertex of the primitive has a value that is
    //    negative or NaN; or
    //  - for some enabled clip distance, any vertex of the primitive has a NaN value.
    //
    // Distances are assembled from two attribute slots starting at
    // backendState.vertexClipCullOffset. Bit i of cullDistanceMask / clipDistanceMask selects
    // slot (i >> 2) and component (i & 3); slot 0 is the first attribute, anything else the second.
    // NOTE(review): the prim parameter is not used in this function.
    372     int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
    373     {
    374         uint8_t cullMask = state.backendState.cullDistanceMask;
    375         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
    376 
    377         typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
    378 
    379         typename SIMD_T::Vec4 vClipCullDistLo[3];
    380         typename SIMD_T::Vec4 vClipCullDistHi[3];
    381 
    382         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
    383         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); // both slots are assembled regardless of which mask bits are set
    384 
    385         DWORD index;
    386         while (_BitScanForward(&index, cullMask))
    387         {
    388             cullMask &= ~(1 << index);
    389             uint32_t slot = index >> 2;
    390             uint32_t component = index & 0x3;
    391 
    392             typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); // NOTE(review): -1.0f is not an all-ones mask; this appears to rely on movemask_ps reading only the sign bit -- confirm
    393             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    394             {
    395                 typename SIMD_T::Float vCullComp;
    396                 if (slot == 0)
    397                 {
    398                     vCullComp = vClipCullDistLo[e][component];
    399                 }
    400                 else
    401                 {
    402                     vCullComp = vClipCullDistHi[e][component];
    403                 }
    404 
    405                 // cull if cull distance < 0 || NAN
    406                 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
    407                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); // AND across vertices: all of them must fail for the prim to be culled
    408             }
    409             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
    410         }
    411 
    412         // clipper should also discard any primitive with NAN clip distance
    413         uint8_t clipMask = state.backendState.clipDistanceMask;
    414         while (_BitScanForward(&index, clipMask))
    415         {
    416             clipMask &= ~(1 << index);
    417             uint32_t slot = index >> 2;
    418             uint32_t component = index & 0x3;
    419 
    420             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    421             {
    422                 typename SIMD_T::Float vClipComp;
    423                 if (slot == 0)
    424                 {
    425                     vClipComp = vClipCullDistLo[e][component];
    426                 }
    427                 else
    428                 {
    429                     vClipComp = vClipCullDistHi[e][component];
    430                 }
    431 
    432                 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp); // value compared with itself: unordered only when it is NaN
    433                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
    434             }
    435         }
    436 
    437         return SIMD_T::movemask_ps(vClipCullMask);
    438     }
    439 
    // Clips the primitives selected by vClipMask and sends the resulting primitives to the binner.
    //
    // prim          - positions of the NumVertsPerPrim input vertices.
    // vPrimMask     - lane mask forwarded to ClipPrims together with vClipMask.
    // vClipMask     - lane mask of the primitives that need clipping.
    // pa            - primitive assembler used to fetch attributes and clip distances.
    // vPrimId / vViewportIdx / vRtIdx - per-lane ids; each input primitive's value is broadcast
    //                 to all primitives that clipping produces from it.
    //
    // Outline:
    //  1. Copy positions, attributes and (if enabled) user clip distances into the local
    //     vertices[] store. Attributes flagged in constantInterpolationMask take the provoking
    //     vertex's value for every vertex.
    //  2. Call ClipPrims on that store; it returns the number of vertices emitted per lane.
    //  3. For every input primitive that still has at least NumVertsPerPrim vertices, gather
    //     that lane's vertices out of vertices[] into transposedPrims[0], wrap it in a
    //     PA_STATE_OPT and bin each primitive it assembles.
    //  4. Add the number of emitted primitives to the CPrimitives stat.
    440     void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
    441                   const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
    442     {
    443         // input/output vertex store for clipper
    444         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
    445 
    446         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
    447         uint32_t provokingVertex = 0;
    448         if (pa.binTopology == TOP_TRIANGLE_FAN)
    449         {
    450             provokingVertex = state.frontendState.provokingVertex.triFan;
    451         }
    452         ///@todo: line topology for wireframe?
    453 
    454         // assemble pos
    455         typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim]; // scratch for the pa.Assemble calls below
    456         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    457         {
    458             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
    459         }
    460 
    461         // assemble attribs
    462         const SWR_BACKEND_STATE& backendState = state.backendState;
    463 
    464         int32_t maxSlot = -1; // highest mapped attribute slot seen; stays -1 when there are no attributes
    465         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
    466         {
    467             // Compute absolute attrib slot in vertex array
    468             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
    469             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
    470             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
    471 
    472             pa.Assemble(inputSlot, tmpVector);
    473 
    474             // if constant interpolation enabled for this attribute, assign the provoking
    475             // vertex values to all edges
    476             if (CheckBit(constantInterpMask, slot))
    477             {
    478                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    479                 {
    480                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
    481                 }
    482             }
    483             else
    484             {
    485                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    486                 {
    487                     vertices[i].attrib[inputSlot] = tmpVector[i];
    488                 }
    489             }
    490         }
    491 
    492         // assemble user clip distances if enabled
    493         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
    494         if (state.backendState.clipDistanceMask & 0xf)
    495         {
    496             pa.Assemble(vertexClipCullSlot, tmpVector);
    497             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    498             {
    499                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
    500             }
    501         }
    502 
    503         if (state.backendState.clipDistanceMask & 0xf0)
    504         {
    505             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
    506             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    507             {
    508                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
    509             }
    510         }
    511 
    512         uint32_t numAttribs = maxSlot + 1;
    513 
    514         typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
    515 
    516         BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
    517 
    518         // set up new PA for binning clipped primitives
    519         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
    520         if (NumVertsPerPrim == 3)
    521         {
    522             clipTopology = TOP_TRIANGLE_FAN;
    523 
    524             // so that the binner knows to bloat wide points later
    525             if (pa.binTopology == TOP_POINT_LIST)
    526             {
    527                 clipTopology = TOP_POINT_LIST;
    528             }
    529         }
    530         else if (NumVertsPerPrim == 2)
    531         {
    532             clipTopology = TOP_LINE_LIST;
    533         }
    534         else
    535         {
    536             SWR_ASSERT(0 && "Unexpected points in clipper.");
    537         }
    538 
    539         const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts); // the SIMD registers are read lane by lane through these pointers
    540         const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
    541         const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
    542         const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
    543 
    544         const SIMD256::Integer vOffsets = SIMD256::set_epi32( // byte offsets of vertices[0..6]; always SIMD256, whatever SIMD_T is
    545             0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
    546             6 * sizeof(SIMDVERTEX_T<SIMD_T>),
    547             5 * sizeof(SIMDVERTEX_T<SIMD_T>),
    548             4 * sizeof(SIMDVERTEX_T<SIMD_T>),
    549             3 * sizeof(SIMDVERTEX_T<SIMD_T>),
    550             2 * sizeof(SIMDVERTEX_T<SIMD_T>),
    551             1 * sizeof(SIMDVERTEX_T<SIMD_T>),
    552             0 * sizeof(SIMDVERTEX_T<SIMD_T>));
    553 
    554         // only need to gather 7 verts
    555         // @todo dynamic mask based on actual # of verts generated per lane
    556         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
    557 
    558         uint32_t numClippedPrims = 0;
    559 
    560         // transpose clipper output so that each lane's vertices are in SIMD order
    561         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
    562         // for triangle fan
    563 
    564 #if defined(_DEBUG)
    565         // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
    566         SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64)); // NOTE(review): the result is used without a null check
    567 
    568 #else
    569         SIMDVERTEX_T<SIMD_T> transposedPrims[2];
    570 
    571 #endif
    572         uint32_t numInputPrims = pa.NumPrims();
    573         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    574         {
    575             uint32_t numEmittedVerts = pVertexCount[inputPrim];
    576             if (numEmittedVerts < NumVertsPerPrim) // too few vertices left to form a primitive: nothing to bin for this lane
    577             {
    578                 continue;
    579             }
    580             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
    581 
    582             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
    583             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
    584 
    585             numClippedPrims += numEmittedPrims;
    586 
    587             // transpose clipper output so that each lane's vertices are in SIMD order
    588             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
    589             // for triangle fan
    590 
    591             // transpose pos
    592             uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; // address of lane inputPrim within vertex 0's position
    593 
    594 #if 0
    595             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
    596             static const float *dummy = reinterpret_cast<const float *>(pBase);
    597 
    598 #endif
    599             for (uint32_t c = 0; c < 4; ++c)
    600             {
    601                 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
    602                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
    603                 pBase += sizeof(typename SIMD_T::Float); // step to the next component
    604             }
    605 
    606             // transpose attribs
    607             pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
    608 
    609             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
    610             {
    611                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
    612 
    613                 for (uint32_t c = 0; c < 4; ++c)
    614                 {
    615                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
    616                     transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
    617                     pBase += sizeof(typename SIMD_T::Float); // pBase keeps advancing across attributes; presumably attribs are stored back to back -- confirm
    618                 }
    619             }
    620 
    621             // transpose user clip distances if enabled
    622             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
    623             if (state.backendState.clipDistanceMask & 0x0f)
    624             {
    625                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
    626 
    627                 for (uint32_t c = 0; c < 4; ++c)
    628                 {
    629                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
    630                     transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
    631                     pBase += sizeof(typename SIMD_T::Float);
    632                 }
    633             }
    634 
    635             if (state.backendState.clipDistanceMask & 0xf0)
    636             {
    637                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
    638 
    639                 for (uint32_t c = 0; c < 4; ++c)
    640                 {
    641                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
    642                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
    643                     pBase += sizeof(typename SIMD_T::Float);
    644                 }
    645             }
    646 
    647             PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
    648             clipPA.viewportArrayActive = pa.viewportArrayActive;
    649             clipPA.rtArrayActive = pa.rtArrayActive;
    650 
    651             static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f }; // entry n has the low n bits set
    652 
    653             const uint32_t primMask = primMaskMap[numEmittedPrims];
    654 
    655             const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
    656             const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
    657             const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
    658 
    659 
    660             while (clipPA.GetNextStreamOutput())
    661             {
    662                 do
    663                 {
    664                     typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
    665 
    666                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
    667 
    668                     if (assemble)
    669                     {
    670                         binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
    671                     }
    672 
    673                 } while (clipPA.NextPrim());
    674             }
    675         }
    676 
    677 #if defined(_DEBUG)
    678         AlignedFree(transposedPrims);
    679 
    680 #endif
    681         // update global pipeline stat
    682         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    683     }
    684 
    // Entry point of the clip stage for one SIMD batch of primitives.
    //
    // pa          - primitive assembler for the batch; pa.pDC must not be null.
    // prim        - positions of the NumVertsPerPrim vertices.
    // primMask    - lane mask of the primitives to process.
    // primId / viewportIdx / rtIdx - per-lane ids forwarded to the clipper or binner.
    //
    // Order of operations:
    //  1. Count the active lanes into the CInvocations stat.
    //  2. Compute clip codes, then drop lanes with NaN positions and, when cull distances
    //     are enabled, lanes rejected by ComputeUserClipCullMask.
    //  3. validMask = remaining lanes whose clip-code intersection is zero.
    //  4. clipMask  = remaining lanes that need clipping (always 0 when NumVertsPerPrim is 1).
    //  5. If any lane needs clipping, ClipSimd handles the batch and does the binning;
    //     otherwise the lanes in validMask go straight to the binner.
    //
    // NOTE(review): ComputeUserClipCullMask also holds the NaN clip-distance check, but it only
    // runs when cullDistanceMask is non-zero -- confirm that is intended.
    // NOTE(review): clipMask and the ClipSimd call are based on primMask, not validMask --
    // presumably ClipPrims copes with trivially rejected lanes; confirm.
    685     void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
    686                       typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
    687     {
    688         SWR_ASSERT(pa.pDC != nullptr);
    689 
    690         SWR_CONTEXT *pContext = pa.pDC->pContext; // NOTE(review): not referenced by name below; presumably consumed by the AR_* / UPDATE_STAT_FE macros -- confirm
    691 
    692         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
    693 
    694         // update clipper invocations pipeline stat
    695         uint32_t numInvoc = _mm_popcnt_u32(primMask);
    696         UPDATE_STAT_FE(CInvocations, numInvoc);
    697 
    698         ComputeClipCodes(prim, viewportIdx);
    699 
    700         // cull prims with NAN coords
    701         primMask &= ~ComputeNaNMask(prim);
    702 
    703         // user cull distance cull
    704         if (state.backendState.cullDistanceMask)
    705         {
    706             primMask &= ~ComputeUserClipCullMask(pa, prim);
    707         }
    708 
    709         // cull prims outside view frustum
    710         typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
    711         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
    712 
    713         // skip clipping for points
    714         uint32_t clipMask = 0;
    715         if (NumVertsPerPrim != 1)
    716         {
    717             clipMask = primMask & ComputeClipMask();
    718         }
    719 
    720         if (clipMask)
    721         {
    722             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
    723             // we have to clip tris, execute the clipper, which will also
    724             // call the binner
    725             ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
    726             AR_END(FEGuardbandClip, 1);
    727         }
    728         else if (validMask)
    729         {
    730             // update CPrimitives pipeline state
    731             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
    732 
    733             // forward valid prims directly to binner
    734             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
    735         }
    736     }
    737 
    738 private:
    739     typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
    740     {
    741         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
    742     }
    743 
    744     typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
    745     {
    746         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
    747         const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
    748         const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
    749 
    750         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
    751         {
    752             0 * sizeof(float),
    753             1 * sizeof(float),
    754             2 * sizeof(float),
    755             3 * sizeof(float),
    756             4 * sizeof(float),
    757             5 * sizeof(float),
    758             6 * sizeof(float),
    759             7 * sizeof(float),
    760             8 * sizeof(float),
    761             9 * sizeof(float),
    762             10 * sizeof(float),
    763             11 * sizeof(float),
    764             12 * sizeof(float),
    765             13 * sizeof(float),
    766             14 * sizeof(float),
    767             15 * sizeof(float),
    768         };
    769 
    770         static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
    771 
    772         typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
    773 
    774         // step to the simdvertex
    775         typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
    776 
    777         // step to the attribute and component
    778         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
    779 
    780         // step to the lane
    781         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
    782 
    783         return vOffsets;
    784     }
    785 
    786     typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
    787     {
    788         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
    789         typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
    790 
    791         return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
    792     }
    793 
    794     void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
    795     {
    796         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
    797 
    798         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
    799         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
    800         uint32_t mask = SIMD_T::movemask_ps(vMask);
    801         DWORD lane;
    802         while (_BitScanForward(&lane, mask))
    803         {
    804             mask &= ~(1 << lane);
    805             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
    806             *(float *)pBuf = pSrc[lane];
    807         }
    808     }
    809 
    810     template<SWR_CLIPCODES ClippingPlane>
    811     void intersect(
    812         const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
    813         const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
    814         const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
    815         const typename SIMD_T::Vec4 &v1,            // vertex 0 position
    816         const typename SIMD_T::Vec4 &v2,            // vertex 1 position
    817         typename SIMD_T::Integer &outIndex,         // output index.
    818         const float *pInVerts,                      // array of all the input positions.
    819         uint32_t numInAttribs,                      // number of attributes per vertex.
    820         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
    821     {
    822         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
    823         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
    824 
    825         // compute interpolation factor
    826         typename SIMD_T::Float t;
    827         switch (ClippingPlane)
    828         {
    829         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
    830         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
    831         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
    832         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
    833         case FRUSTUM_NEAR:
    834             // DX Znear plane is 0, GL is -w
    835             if (this->state.rastState.clipHalfZ)
    836             {
    837                 t = ComputeInterpFactor(v1[2], v2[2]);
    838             }
    839             else
    840             {
    841                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
    842             }
    843             break;
    844         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
    845         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
    846         };
    847 
    848         // interpolate position and store
    849         for (uint32_t c = 0; c < 4; ++c)
    850         {
    851             typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
    852             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
    853         }
    854 
    855         // interpolate attributes and store
    856         for (uint32_t a = 0; a < numInAttribs; ++a)
    857         {
    858             uint32_t attribSlot = vertexAttribOffset + a;
    859             for (uint32_t c = 0; c < 4; ++c)
    860             {
    861                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    862                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    863                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    864                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    865             }
    866         }
    867 
    868         // interpolate clip distance if enabled
    869         if (this->state.backendState.clipDistanceMask & 0xf)
    870         {
    871             uint32_t attribSlot = vertexClipCullOffset;
    872             for (uint32_t c = 0; c < 4; ++c)
    873             {
    874                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    875                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    876                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    877                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    878             }
    879         }
    880 
    881         if (this->state.backendState.clipDistanceMask & 0xf0)
    882         {
    883             uint32_t attribSlot = vertexClipCullOffset + 1;
    884             for (uint32_t c = 0; c < 4; ++c)
    885             {
    886                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    887                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    888                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    889                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    890             }
    891         }
    892     }
    893 
    894     template<SWR_CLIPCODES ClippingPlane>
    895     typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
    896     {
    897         switch (ClippingPlane)
    898         {
    899         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
    900         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
    901         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
    902         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
    903         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
    904         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
    905         default:
    906             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
    907             return SIMD_T::setzero_ps();
    908         }
    909     }
    910 
    911     template<SWR_CLIPCODES ClippingPlane>
    912     typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
    913     {
    914         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
    915 
    916         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
    917         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
    918         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
    919 
    920         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
    921         {
    922             typename SIMD_T::Integer s = vCurIndex;
    923             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
    924             typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
    925             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
    926 
    927             // gather position
    928             typename SIMD_T::Vec4 vInPos0, vInPos1;
    929             for (uint32_t c = 0; c < 4; ++c)
    930             {
    931                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
    932                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
    933             }
    934 
    935             // compute inside mask
    936             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
    937             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
    938 
    939             // compute intersection mask (s_in != p_in)
    940             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
    941             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
    942 
    943             // store s if inside
    944             s_in = SIMD_T::and_ps(s_in, vActiveMask);
    945             if (!SIMD_T::testz_ps(s_in, s_in))
    946             {
    947                 // store position
    948                 for (uint32_t c = 0; c < 4; ++c)
    949                 {
    950                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
    951                 }
    952 
    953                 // store attribs
    954                 for (uint32_t a = 0; a < numInAttribs; ++a)
    955                 {
    956                     uint32_t attribSlot = vertexAttribOffset + a;
    957                     for (uint32_t c = 0; c < 4; ++c)
    958                     {
    959                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    960                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    961                     }
    962                 }
    963 
    964                 // store clip distance if enabled
    965                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
    966                 if (this->state.backendState.clipDistanceMask & 0xf)
    967                 {
    968                     uint32_t attribSlot = vertexClipCullSlot;
    969                     for (uint32_t c = 0; c < 4; ++c)
    970                     {
    971                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    972                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    973                     }
    974                 }
    975 
    976                 if (this->state.backendState.clipDistanceMask & 0xf0)
    977                 {
    978                     uint32_t attribSlot = vertexClipCullSlot + 1;
    979                     for (uint32_t c = 0; c < 4; ++c)
    980                     {
    981                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    982                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    983                     }
    984                 }
    985 
    986                 // increment outIndex
    987                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
    988             }
    989 
    990             // compute and store intersection
    991             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
    992             {
    993                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
    994 
    995                 // increment outIndex for active lanes
    996                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
    997             }
    998 
    999             // increment loop index and update active mask
   1000             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
   1001             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
   1002         }
   1003 
   1004         return vOutIndex;
   1005     }
   1006 
   1007     template<SWR_CLIPCODES ClippingPlane>
   1008     typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
   1009     {
   1010         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
   1011 
   1012         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
   1013         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
   1014         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
   1015 
   1016         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
   1017         {
   1018             typename SIMD_T::Integer s = vCurIndex;
   1019             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
   1020 
   1021             // gather position
   1022             typename SIMD_T::Vec4 vInPos0, vInPos1;
   1023             for (uint32_t c = 0; c < 4; ++c)
   1024             {
   1025                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
   1026                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
   1027             }
   1028 
   1029             // compute inside mask
   1030             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
   1031             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
   1032 
   1033             // compute intersection mask (s_in != p_in)
   1034             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
   1035             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
   1036 
   1037             // store s if inside
   1038             s_in = SIMD_T::and_ps(s_in, vActiveMask);
   1039             if (!SIMD_T::testz_ps(s_in, s_in))
   1040             {
   1041                 for (uint32_t c = 0; c < 4; ++c)
   1042                 {
   1043                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
   1044                 }
   1045 
   1046                 // interpolate attributes and store
   1047                 for (uint32_t a = 0; a < numInAttribs; ++a)
   1048                 {
   1049                     uint32_t attribSlot = vertexAttribOffset + a;
   1050                     for (uint32_t c = 0; c < 4; ++c)
   1051                     {
   1052                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
   1053                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
   1054                     }
   1055                 }
   1056 
   1057                 // increment outIndex
   1058                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
   1059             }
   1060 
   1061             // compute and store intersection
   1062             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
   1063             {
   1064                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
   1065 
   1066                 // increment outIndex for active lanes
   1067                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
   1068             }
   1069 
   1070             // store p if inside
   1071             p_in = SIMD_T::and_ps(p_in, vActiveMask);
   1072             if (!SIMD_T::testz_ps(p_in, p_in))
   1073             {
   1074                 for (uint32_t c = 0; c < 4; ++c)
   1075                 {
   1076                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
   1077                 }
   1078 
   1079                 // interpolate attributes and store
   1080                 for (uint32_t a = 0; a < numInAttribs; ++a)
   1081                 {
   1082                     uint32_t attribSlot = vertexAttribOffset + a;
   1083                     for (uint32_t c = 0; c < 4; ++c)
   1084                     {
   1085                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
   1086                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
   1087                     }
   1088                 }
   1089 
   1090                 // increment outIndex
   1091                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
   1092             }
   1093         }
   1094 
   1095         return vOutIndex;
   1096     }
   1097 
   1098     typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
   1099     {
   1100         // temp storage
   1101         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
   1102 
   1103         // zero out num input verts for non-active lanes
   1104         typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
   1105         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
   1106 
   1107         // clip prims to frustum
   1108         typename SIMD_T::Integer vNumOutPts;
   1109         if (NumVertsPerPrim == 3)
   1110         {
   1111             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
   1112             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1113             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
   1114             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1115             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
   1116             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1117         }
   1118         else
   1119         {
   1120             SWR_ASSERT(NumVertsPerPrim == 2);
   1121             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
   1122             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1123             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
   1124             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1125             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
   1126             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
   1127         }
   1128 
   1129         // restore num verts for non-clipped, active lanes
   1130         typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
   1131         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
   1132 
   1133         return vNumOutPts;
   1134     }
   1135 
   1136     const uint32_t workerId{ 0 };
   1137     DRAW_CONTEXT *pDC{ nullptr };
   1138     const API_STATE &state;
   1139     typename SIMD_T::Float clipCodes[NumVertsPerPrim];
   1140 };
   1141 
   1142 
   1143 // pipeline stage functions
   1144 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
   1145 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
   1146 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
   1147 #if USE_SIMD16_FRONTEND
   1148 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
   1149 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
   1150 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
   1151 #endif
   1152 
   1153