Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file clip.h
     24 *
     25 * @brief Definitions for clipping
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "common/simdintrin.h"
     31 #include "core/context.h"
     32 #include "core/pa.h"
     33 #include "rdtsc_core.h"
     34 
     35 // Temp storage used by the clipper
     36 extern THREAD simdvertex tlsTempVertices[7];
     37 
     38 enum SWR_CLIPCODES
     39 {
     40     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
     41     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
     42 #define CLIPCODE_SHIFT 23
     43     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
     44     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
     45     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
     46     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
     47 
     48     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
     49     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
     50 
     51     NEGW            = (0x40 << CLIPCODE_SHIFT),
     52 
     53     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
     54     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
     55     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
     56     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
     57 };
     58 
     59 #define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
     60 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
     61 
     62 void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
     63           int *numVerts, float *pOutAttribs);
     64 
     65 INLINE
     66 void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
     67 {
     68     clipCodes = _simd_setzero_ps();
     69 
     70     // -w
     71     simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
     72 
     73     // FRUSTUM_LEFT
     74     simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
     75     clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
     76 
     77     // FRUSTUM_TOP
     78     vRes = _simd_cmplt_ps(vertex.y, vNegW);
     79     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
     80 
     81     // FRUSTUM_RIGHT
     82     vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
     83     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
     84 
     85     // FRUSTUM_BOTTOM
     86     vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
     87     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
     88 
     89     if (state.rastState.depthClipEnable)
     90     {
     91         // FRUSTUM_NEAR
     92         // DX clips depth [0..w], GL clips [-w..w]
     93         if (state.rastState.clipHalfZ)
     94         {
     95             vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
     96         }
     97         else
     98         {
     99             vRes = _simd_cmplt_ps(vertex.z, vNegW);
    100         }
    101         clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
    102 
    103         // FRUSTUM_FAR
    104         vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
    105         clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
    106     }
    107 
    108     // NEGW
    109     vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
    110     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
    111 
    112     // GUARDBAND_LEFT
    113     simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
    114     vRes = _simd_cmplt_ps(vertex.x, gbMult);
    115     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
    116 
    117     // GUARDBAND_TOP
    118     gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
    119     vRes = _simd_cmplt_ps(vertex.y, gbMult);
    120     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
    121 
    122     // GUARDBAND_RIGHT
    123     gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
    124     vRes = _simd_cmpgt_ps(vertex.x, gbMult);
    125     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
    126 
    127     // GUARDBAND_BOTTOM
    128     gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
    129     vRes = _simd_cmpgt_ps(vertex.y, gbMult);
    130     clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
    131 }
    132 
    133 template<uint32_t NumVertsPerPrim>
    134 class Clipper
    135 {
    136 public:
    137     Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
    138         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
    139     {
    140         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
    141     }
    142 
    143     void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
    144     {
    145         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    146         {
    147             ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
    148         }
    149     }
    150 
    151     simdscalar ComputeClipCodeIntersection()
    152     {
    153         simdscalar result = this->clipCodes[0];
    154         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    155         {
    156             result = _simd_and_ps(result, this->clipCodes[i]);
    157         }
    158         return result;
    159     }
    160 
    161     simdscalar ComputeClipCodeUnion()
    162     {
    163         simdscalar result = this->clipCodes[0];
    164         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    165         {
    166             result = _simd_or_ps(result, this->clipCodes[i]);
    167         }
    168         return result;
    169     }
    170 
    171     int ComputeNegWMask()
    172     {
    173         simdscalar clipCodeUnion = ComputeClipCodeUnion();
    174         clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
    175         return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
    176     }
    177 
    178     int ComputeClipMask()
    179     {
    180         simdscalar clipUnion = ComputeClipCodeUnion();
    181         clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
    182         return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
    183     }
    184 
    185     // clipper is responsible for culling any prims with NAN coordinates
    186     int ComputeNaNMask(simdvector prim[])
    187     {
    188         simdscalar vNanMask = _simd_setzero_ps();
    189         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    190         {
    191             simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
    192             vNanMask = _simd_or_ps(vNanMask, vNan01);
    193             simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
    194             vNanMask = _simd_or_ps(vNanMask, vNan23);
    195         }
    196 
    197         return _simd_movemask_ps(vNanMask);
    198     }
    199 
    200     int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
    201     {
    202         uint8_t cullMask = this->state.rastState.cullDistanceMask;
    203         simdscalar vClipCullMask = _simd_setzero_ps();
    204         DWORD index;
    205 
    206         simdvector vClipCullDistLo[3];
    207         simdvector vClipCullDistHi[3];
    208 
    209         pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
    210         pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
    211         while (_BitScanForward(&index, cullMask))
    212         {
    213             cullMask &= ~(1 << index);
    214             uint32_t slot = index >> 2;
    215             uint32_t component = index & 0x3;
    216 
    217             simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
    218             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    219             {
    220                 simdscalar vCullComp;
    221                 if (slot == 0)
    222                 {
    223                     vCullComp = vClipCullDistLo[e][component];
    224                 }
    225                 else
    226                 {
    227                     vCullComp = vClipCullDistHi[e][component];
    228                 }
    229 
    230                 // cull if cull distance < 0 || NAN
    231                 simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
    232                 vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
    233             }
    234             vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
    235         }
    236 
    237         // clipper should also discard any primitive with NAN clip distance
    238         uint8_t clipMask = this->state.rastState.clipDistanceMask;
    239         while (_BitScanForward(&index, clipMask))
    240         {
    241             clipMask &= ~(1 << index);
    242             uint32_t slot = index >> 2;
    243             uint32_t component = index & 0x3;
    244 
    245             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    246             {
    247                 simdscalar vClipComp;
    248                 if (slot == 0)
    249                 {
    250                     vClipComp = vClipCullDistLo[e][component];
    251                 }
    252                 else
    253                 {
    254                     vClipComp = vClipCullDistHi[e][component];
    255                 }
    256 
    257                 simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
    258                 vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
    259             }
    260         }
    261 
    262         return _simd_movemask_ps(vClipCullMask);
    263     }
    264 
    265     // clip SIMD primitives
    266     void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
    267     {
    268         // input/output vertex store for clipper
    269         simdvertex vertices[7]; // maximum 7 verts generated per triangle
    270 
    271         LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
    272         uint32_t provokingVertex = 0;
    273         if(pa.binTopology == TOP_TRIANGLE_FAN)
    274         {
    275             provokingVertex = this->state.frontendState.provokingVertex.triFan;
    276         }
    277         ///@todo: line topology for wireframe?
    278 
    279         // assemble pos
    280         simdvector tmpVector[NumVertsPerPrim];
    281         pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
    282         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    283         {
    284             vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
    285         }
    286 
    287         // assemble attribs
    288         const SWR_BACKEND_STATE& backendState = this->state.backendState;
    289 
    290         int32_t maxSlot = -1;
    291         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
    292         {
    293             // Compute absolute attrib slot in vertex array
    294             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
    295             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
    296             uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
    297 
    298             pa.Assemble(inputSlot, tmpVector);
    299 
    300             // if constant interpolation enabled for this attribute, assign the provoking
    301             // vertex values to all edges
    302             if (_bittest(&constantInterpMask, slot))
    303             {
    304                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    305                 {
    306                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
    307                 }
    308             }
    309             else
    310             {
    311                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    312                 {
    313                     vertices[i].attrib[inputSlot] = tmpVector[i];
    314                 }
    315             }
    316         }
    317 
    318         // assemble user clip distances if enabled
    319         if (this->state.rastState.clipDistanceMask & 0xf)
    320         {
    321             pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
    322             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    323             {
    324                 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
    325             }
    326         }
    327 
    328         if (this->state.rastState.clipDistanceMask & 0xf0)
    329         {
    330             pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
    331             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    332             {
    333                 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
    334             }
    335         }
    336 
    337         uint32_t numAttribs = maxSlot + 1;
    338 
    339         simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
    340 
    341         // set up new PA for binning clipped primitives
    342         PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
    343         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
    344         if (NumVertsPerPrim == 3)
    345         {
    346             pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
    347             clipTopology = TOP_TRIANGLE_FAN;
    348 
    349             // so that the binner knows to bloat wide points later
    350             if (pa.binTopology == TOP_POINT_LIST)
    351                 clipTopology = TOP_POINT_LIST;
    352 
    353         }
    354         else if (NumVertsPerPrim == 2)
    355         {
    356             pfnBinFunc = BinLines;
    357             clipTopology = TOP_LINE_LIST;
    358         }
    359         else
    360         {
    361             SWR_ASSERT(0 && "Unexpected points in clipper.");
    362         }
    363 
    364         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
    365         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
    366         uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
    367 
    368         const simdscalari vOffsets = _mm256_set_epi32(
    369             0 * sizeof(simdvertex),  // unused lane
    370             6 * sizeof(simdvertex),
    371             5 * sizeof(simdvertex),
    372             4 * sizeof(simdvertex),
    373             3 * sizeof(simdvertex),
    374             2 * sizeof(simdvertex),
    375             1 * sizeof(simdvertex),
    376             0 * sizeof(simdvertex));
    377 
    378         // only need to gather 7 verts
    379         // @todo dynamic mask based on actual # of verts generated per lane
    380         const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
    381 
    382         uint32_t numClippedPrims = 0;
    383         for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
    384         {
    385             uint32_t numEmittedVerts = pVertexCount[inputPrim];
    386             if (numEmittedVerts < NumVertsPerPrim)
    387             {
    388                 continue;
    389             }
    390             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
    391 
    392             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
    393             numClippedPrims += numEmittedPrims;
    394 
    395             // tranpose clipper output so that each lane's vertices are in SIMD order
    396             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
    397             // for triangle fan
    398             simdvertex transposedPrims[2];
    399 
    400             // transpose pos
    401             uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
    402             for (uint32_t c = 0; c < 4; ++c)
    403             {
    404                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
    405                 pBase += sizeof(simdscalar);
    406             }
    407 
    408             // transpose attribs
    409             pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
    410             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
    411             {
    412                 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
    413                 for (uint32_t c = 0; c < 4; ++c)
    414                 {
    415                     transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
    416                     pBase += sizeof(simdscalar);
    417                 }
    418             }
    419 
    420             // transpose user clip distances if enabled
    421             if (this->state.rastState.clipDistanceMask & 0xf)
    422             {
    423                 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
    424                 for (uint32_t c = 0; c < 4; ++c)
    425                 {
    426                     transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
    427                     pBase += sizeof(simdscalar);
    428                 }
    429             }
    430 
    431             if (this->state.rastState.clipDistanceMask & 0xf0)
    432             {
    433                 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
    434                 for (uint32_t c = 0; c < 4; ++c)
    435                 {
    436                     transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
    437                     pBase += sizeof(simdscalar);
    438                 }
    439             }
    440 
    441             PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
    442 
    443             while (clipPa.GetNextStreamOutput())
    444             {
    445                 do
    446                 {
    447                     simdvector attrib[NumVertsPerPrim];
    448                     bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
    449                     if (assemble)
    450                     {
    451                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
    452                         pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
    453                     }
    454                 } while (clipPa.NextPrim());
    455             }
    456         }
    457 
    458         // update global pipeline stat
    459         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    460     }
    461 
    462     // execute the clipper stage
    463     void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
    464     {
    465         SWR_ASSERT(pa.pDC != nullptr);
    466         SWR_CONTEXT* pContext = pa.pDC->pContext;
    467 
    468         // set up binner based on PA state
    469         PFN_PROCESS_PRIMS pfnBinner;
    470         switch (pa.binTopology)
    471         {
    472         case TOP_POINT_LIST:
    473             pfnBinner = BinPoints;
    474             break;
    475         case TOP_LINE_LIST:
    476         case TOP_LINE_STRIP:
    477         case TOP_LINE_LOOP:
    478         case TOP_LINE_LIST_ADJ:
    479         case TOP_LISTSTRIP_ADJ:
    480             pfnBinner = BinLines;
    481             break;
    482         default:
    483             pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
    484             break;
    485         };
    486 
    487         // update clipper invocations pipeline stat
    488         uint32_t numInvoc = _mm_popcnt_u32(primMask);
    489         UPDATE_STAT_FE(CInvocations, numInvoc);
    490 
    491         ComputeClipCodes(prim, viewportIdx);
    492 
    493         // cull prims with NAN coords
    494         primMask &= ~ComputeNaNMask(prim);
    495 
    496         // user cull distance cull
    497         if (this->state.rastState.cullDistanceMask)
    498         {
    499             primMask &= ~ComputeUserClipCullMask(pa, prim);
    500         }
    501 
    502         // cull prims outside view frustum
    503         simdscalar clipIntersection = ComputeClipCodeIntersection();
    504         int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
    505 
    506         // skip clipping for points
    507         uint32_t clipMask = 0;
    508         if (NumVertsPerPrim != 1)
    509         {
    510             clipMask = primMask & ComputeClipMask();
    511         }
    512 
    513         if (clipMask)
    514         {
    515             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
    516             // we have to clip tris, execute the clipper, which will also
    517             // call the binner
    518             ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
    519             AR_END(FEGuardbandClip, 1);
    520         }
    521         else if (validMask)
    522         {
    523             // update CPrimitives pipeline state
    524             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
    525 
    526             // forward valid prims directly to binner
    527             pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
    528         }
    529     }
    530 
    531 private:
    532     inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
    533     {
    534         return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
    535     }
    536 
    537     inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
    538     {
    539         const uint32_t simdVertexStride = sizeof(simdvertex);
    540         const uint32_t componentStride = sizeof(simdscalar);
    541         const uint32_t attribStride = sizeof(simdvector);
    542         const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
    543             3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
    544 
    545         // step to the simdvertex
    546         simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
    547 
    548         // step to the attribute and component
    549         vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
    550 
    551         // step to the lane
    552         vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
    553 
    554         return vOffsets;
    555     }
    556 
    557     // gathers a single component for a given attribute for each SIMD lane
    558     inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
    559     {
    560         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
    561         simdscalar vSrc = _mm256_undefined_ps();
    562         return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
    563     }
    564 
    565     inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
    566     {
    567         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
    568 
    569         uint32_t* pOffsets = (uint32_t*)&vOffsets;
    570         float* pSrc = (float*)&vSrc;
    571         uint32_t mask = _simd_movemask_ps(vMask);
    572         DWORD lane;
    573         while (_BitScanForward(&lane, mask))
    574         {
    575             mask &= ~(1 << lane);
    576             uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
    577             *(float*)pBuf = pSrc[lane];
    578         }
    579     }
    580 
    581     template<SWR_CLIPCODES ClippingPlane>
    582     inline void intersect(
    583         const simdscalar& vActiveMask,  // active lanes to operate on
    584         const simdscalari& s,           // index to first edge vertex v0 in pInPts.
    585         const simdscalari& p,           // index to second edge vertex v1 in pInPts.
    586         const simdvector& v1,           // vertex 0 position
    587         const simdvector& v2,           // vertex 1 position
    588         simdscalari& outIndex,          // output index.
    589         const float *pInVerts,          // array of all the input positions.
    590         uint32_t numInAttribs,          // number of attributes per vertex.
    591         float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
    592     {
    593         // compute interpolation factor
    594         simdscalar t;
    595         switch (ClippingPlane)
    596         {
    597         case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
    598         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
    599         case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
    600         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
    601         case FRUSTUM_NEAR:
    602             // DX Znear plane is 0, GL is -w
    603             if (this->state.rastState.clipHalfZ)
    604             {
    605                 t = ComputeInterpFactor(v1[2], v2[2]);
    606             }
    607             else
    608             {
    609                 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
    610             }
    611             break;
    612         case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
    613         default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
    614         };
    615 
    616         // interpolate position and store
    617         for (uint32_t c = 0; c < 4; ++c)
    618         {
    619             simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
    620             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
    621         }
    622 
    623         // interpolate attributes and store
    624         for (uint32_t a = 0; a < numInAttribs; ++a)
    625         {
    626             uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
    627             for (uint32_t c = 0; c < 4; ++c)
    628             {
    629                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    630                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    631                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    632                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    633             }
    634         }
    635 
    636         // interpolate clip distance if enabled
    637         if (this->state.rastState.clipDistanceMask & 0xf)
    638         {
    639             uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
    640             for (uint32_t c = 0; c < 4; ++c)
    641             {
    642                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    643                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    644                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    645                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    646             }
    647         }
    648 
    649         if (this->state.rastState.clipDistanceMask & 0xf0)
    650         {
    651             uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
    652             for (uint32_t c = 0; c < 4; ++c)
    653             {
    654                 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
    655                 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
    656                 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
    657                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
    658             }
    659         }
    660     }
    661 
    662     template<SWR_CLIPCODES ClippingPlane>
    663     inline simdscalar inside(const simdvector& v)
    664     {
    665         switch (ClippingPlane)
    666         {
    667         case FRUSTUM_LEFT:      return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
    668         case FRUSTUM_RIGHT:     return _simd_cmple_ps(v[0], v[3]);
    669         case FRUSTUM_TOP:       return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
    670         case FRUSTUM_BOTTOM:    return _simd_cmple_ps(v[1], v[3]);
    671         case FRUSTUM_NEAR:      return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
    672         case FRUSTUM_FAR:       return _simd_cmple_ps(v[2], v[3]);
    673         default:
    674             SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
    675             return _simd_setzero_ps();
    676         }
    677     }
    678 
    679     template<SWR_CLIPCODES ClippingPlane>
    680     simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    681     {
    682         simdscalari vCurIndex = _simd_setzero_si();
    683         simdscalari vOutIndex = _simd_setzero_si();
    684         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
    685 
    686         while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
    687         {
    688             simdscalari s = vCurIndex;
    689             simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
    690             simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
    691             p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
    692 
    693             // gather position
    694             simdvector vInPos0, vInPos1;
    695             for (uint32_t c = 0; c < 4; ++c)
    696             {
    697                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
    698                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
    699             }
    700 
    701             // compute inside mask
    702             simdscalar s_in = inside<ClippingPlane>(vInPos0);
    703             simdscalar p_in = inside<ClippingPlane>(vInPos1);
    704 
    705             // compute intersection mask (s_in != p_in)
    706             simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
    707             intersectMask = _simd_and_ps(intersectMask, vActiveMask);
    708 
    709             // store s if inside
    710             s_in = _simd_and_ps(s_in, vActiveMask);
    711             if (!_simd_testz_ps(s_in, s_in))
    712             {
    713                 // store position
    714                 for (uint32_t c = 0; c < 4; ++c)
    715                 {
    716                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
    717                 }
    718 
    719                 // store attribs
    720                 for (uint32_t a = 0; a < numInAttribs; ++a)
    721                 {
    722                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
    723                     for (uint32_t c = 0; c < 4; ++c)
    724                     {
    725                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    726                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    727                     }
    728                 }
    729 
    730                 // store clip distance if enabled
    731                 if (this->state.rastState.clipDistanceMask & 0xf)
    732                 {
    733                     uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
    734                     for (uint32_t c = 0; c < 4; ++c)
    735                     {
    736                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    737                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    738                     }
    739                 }
    740 
    741                 if (this->state.rastState.clipDistanceMask & 0xf0)
    742                 {
    743                     uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
    744                     for (uint32_t c = 0; c < 4; ++c)
    745                     {
    746                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    747                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    748                     }
    749                 }
    750 
    751                 // increment outIndex
    752                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
    753             }
    754 
    755             // compute and store intersection
    756             if (!_simd_testz_ps(intersectMask, intersectMask))
    757             {
    758                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
    759 
    760                 // increment outIndex for active lanes
    761                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
    762             }
    763 
    764             // increment loop index and update active mask
    765             vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
    766             vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
    767         }
    768 
    769         return vOutIndex;
    770     }
    771 
    772     template<SWR_CLIPCODES ClippingPlane>
    773     simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    774     {
    775         simdscalari vCurIndex = _simd_setzero_si();
    776         simdscalari vOutIndex = _simd_setzero_si();
    777         simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
    778 
    779         if (!_simd_testz_ps(vActiveMask, vActiveMask))
    780         {
    781             simdscalari s = vCurIndex;
    782             simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
    783 
    784             // gather position
    785             simdvector vInPos0, vInPos1;
    786             for (uint32_t c = 0; c < 4; ++c)
    787             {
    788                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
    789                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
    790             }
    791 
    792             // compute inside mask
    793             simdscalar s_in = inside<ClippingPlane>(vInPos0);
    794             simdscalar p_in = inside<ClippingPlane>(vInPos1);
    795 
    796             // compute intersection mask (s_in != p_in)
    797             simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
    798             intersectMask = _simd_and_ps(intersectMask, vActiveMask);
    799 
    800             // store s if inside
    801             s_in = _simd_and_ps(s_in, vActiveMask);
    802             if (!_simd_testz_ps(s_in, s_in))
    803             {
    804                 for (uint32_t c = 0; c < 4; ++c)
    805                 {
    806                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
    807                 }
    808 
    809                 // interpolate attributes and store
    810                 for (uint32_t a = 0; a < numInAttribs; ++a)
    811                 {
    812                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
    813                     for (uint32_t c = 0; c < 4; ++c)
    814                     {
    815                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
    816                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
    817                     }
    818                 }
    819 
    820                 // increment outIndex
    821                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
    822             }
    823 
    824             // compute and store intersection
    825             if (!_simd_testz_ps(intersectMask, intersectMask))
    826             {
    827                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
    828 
    829                 // increment outIndex for active lanes
    830                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
    831             }
    832 
    833             // store p if inside
    834             p_in = _simd_and_ps(p_in, vActiveMask);
    835             if (!_simd_testz_ps(p_in, p_in))
    836             {
    837                 for (uint32_t c = 0; c < 4; ++c)
    838                 {
    839                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
    840                 }
    841 
    842                 // interpolate attributes and store
    843                 for (uint32_t a = 0; a < numInAttribs; ++a)
    844                 {
    845                     uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
    846                     for (uint32_t c = 0; c < 4; ++c)
    847                     {
    848                         simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
    849                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
    850                     }
    851                 }
    852 
    853                 // increment outIndex
    854                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
    855             }
    856         }
    857 
    858         return vOutIndex;
    859     }
    860 
    861     //////////////////////////////////////////////////////////////////////////
    862     /// @brief Vertical clipper. Clips SIMD primitives at a time
    863     /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
    864     /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
    865     /// @param numAttribs - number of valid input attribs, including position
    866     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
    867     {
    868         // temp storage
    869         float* pTempVerts = (float*)&tlsTempVertices[0];
    870 
    871         // zero out num input verts for non-active lanes
    872         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
    873         vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
    874 
    875         // clip prims to frustum
    876         simdscalari vNumOutPts;
    877         if (NumVertsPerPrim == 3)
    878         {
    879             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
    880             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    881             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
    882             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    883             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
    884             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    885         }
    886         else
    887         {
    888             SWR_ASSERT(NumVertsPerPrim == 2);
    889             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
    890             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    891             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
    892             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    893             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
    894             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    895         }
    896 
    897         // restore num verts for non-clipped, active lanes
    898         simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
    899         vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
    900 
    901         return vNumOutPts;
    902     }
    903 
    904     const uint32_t workerId{ 0 };           // defaults to 0; presumably the id of the worker thread running this clipper - TODO confirm
    905     DRAW_CONTEXT* pDC{ nullptr };           // defaults to null; presumably the draw context being clipped for - TODO confirm
    906     const API_STATE& state;                 // read-only API state; the clip routines read state.rastState.clipHalfZ and clipDistanceMask
    907     simdscalar clipCodes[NumVertsPerPrim];  // one SIMD value per primitive vertex; presumably the vertices' clip codes - TODO confirm
    908 };
    909 
    910 
    911 // pipeline stage functions
        // Declarations only. The three entry points share one signature and differ only in name
        // (triangles, lines, points). Going by the parameter names and types: a draw context, a
        // PA_STATE, a worker id, an array of SIMD vectors holding the primitives, a scalar bit mask of
        // primitives, and SIMD primitive ids and viewport indices.
        // NOTE(review): exact parameter semantics are not visible from these declarations - confirm
        // against the definitions.
    912 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
    913 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
    914 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
    915