Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file pa_avx.cpp
     24 *
     25 * @brief AVX implementation for primitive assembly.
     26 *        N primitives are assembled at a time, where N is the SIMD width.
     27 *        A state machine, that is specific for a given topology, drives the
     28 *        assembly of vertices into triangles.
     29 *
     30 ******************************************************************************/
     31 #include "context.h"
     32 #include "pa.h"
     33 #include "frontend.h"
     34 
     35 #if (KNOB_SIMD_WIDTH == 8)
     36 
     37 INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     38 {
     39     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     40     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     41     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
     42 }
     43 
     44 INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     45 {
     46     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     47     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     48     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
     49 }
     50 
     51 INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     52 {
     53     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     54     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     55     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
     56 }
     57 
     58 INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     59 {
     60     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     61     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     62     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
     63 }
     64 
     65 INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     66 {
     67     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     68     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     69     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
     70 }
     71 
     72 INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     73 {
     74     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     75     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     76     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
     77 }
     78 
     79 INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     80 {
     81     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     82     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     83     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
     84 }
     85 
     86 INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
     87 {
     88     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     89     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     90     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
     91 }
     92 
     93 INLINE simd4scalar swizzleLane0(const simdvector &v)
     94 {
     95     return swizzleLane0(v.x, v.y, v.z, v.w);
     96 }
     97 
     98 INLINE simd4scalar swizzleLane1(const simdvector &v)
     99 {
    100     return swizzleLane1(v.x, v.y, v.z, v.w);
    101 }
    102 
    103 INLINE simd4scalar swizzleLane2(const simdvector &v)
    104 {
    105     return swizzleLane2(v.x, v.y, v.z, v.w);
    106 }
    107 
    108 INLINE simd4scalar swizzleLane3(const simdvector &v)
    109 {
    110     return swizzleLane3(v.x, v.y, v.z, v.w);
    111 }
    112 
    113 INLINE simd4scalar swizzleLane4(const simdvector &v)
    114 {
    115     return swizzleLane4(v.x, v.y, v.z, v.w);
    116 }
    117 
    118 INLINE simd4scalar swizzleLane5(const simdvector &v)
    119 {
    120     return swizzleLane5(v.x, v.y, v.z, v.w);
    121 }
    122 
    123 INLINE simd4scalar swizzleLane6(const simdvector &v)
    124 {
    125     return swizzleLane6(v.x, v.y, v.z, v.w);
    126 }
    127 
    128 INLINE simd4scalar swizzleLane7(const simdvector &v)
    129 {
    130     return swizzleLane7(v.x, v.y, v.z, v.w);
    131 }
    132 
    133 INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
    134 {
    135     switch (lane)
    136     {
    137     case 0:
    138         return swizzleLane0(v);
    139     case 1:
    140         return swizzleLane1(v);
    141     case 2:
    142         return swizzleLane2(v);
    143     case 3:
    144         return swizzleLane3(v);
    145     case 4:
    146         return swizzleLane4(v);
    147     case 5:
    148         return swizzleLane5(v);
    149     case 6:
    150         return swizzleLane6(v);
    151     case 7:
    152         return swizzleLane7(v);
    153     default:
    154         return _mm_setzero_ps();
    155     }
    156 }
    157 
    158 #if ENABLE_AVX512_SIMD16
    159 INLINE simd4scalar swizzleLane0(const simd16vector &v)
    160 {
    161     return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    162 }
    163 
    164 INLINE simd4scalar swizzleLane1(const simd16vector &v)
    165 {
    166     return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    167 }
    168 
    169 INLINE simd4scalar swizzleLane2(const simd16vector &v)
    170 {
    171     return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    172 }
    173 
    174 INLINE simd4scalar swizzleLane3(const simd16vector &v)
    175 {
    176     return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    177 }
    178 
    179 INLINE simd4scalar swizzleLane4(const simd16vector &v)
    180 {
    181     return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    182 }
    183 
    184 INLINE simd4scalar swizzleLane5(const simd16vector &v)
    185 {
    186     return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    187 }
    188 
    189 INLINE simd4scalar swizzleLane6(const simd16vector &v)
    190 {
    191     return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    192 }
    193 
    194 INLINE simd4scalar swizzleLane7(const simd16vector &v)
    195 {
    196     return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
    197 }
    198 
    199 INLINE simd4scalar swizzleLane8(const simd16vector &v)
    200 {
    201     return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    202 }
    203 
    204 INLINE simd4scalar swizzleLane9(const simd16vector &v)
    205 {
    206     return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    207 }
    208 
    209 INLINE simd4scalar swizzleLaneA(const simd16vector &v)
    210 {
    211     return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    212 }
    213 
    214 INLINE simd4scalar swizzleLaneB(const simd16vector &v)
    215 {
    216     return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    217 }
    218 
    219 INLINE simd4scalar swizzleLaneC(const simd16vector &v)
    220 {
    221     return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    222 }
    223 
    224 INLINE simd4scalar swizzleLaneD(const simd16vector &v)
    225 {
    226     return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    227 }
    228 
    229 INLINE simd4scalar swizzleLaneE(const simd16vector &v)
    230 {
    231     return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    232 }
    233 
    234 INLINE simd4scalar swizzleLaneF(const simd16vector &v)
    235 {
    236     return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
    237 }
    238 
    239 INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
    240 {
    241     switch (lane)
    242     {
    243     case 0:
    244         return swizzleLane0(v);
    245     case 1:
    246         return swizzleLane1(v);
    247     case 2:
    248         return swizzleLane2(v);
    249     case 3:
    250         return swizzleLane3(v);
    251     case 4:
    252         return swizzleLane4(v);
    253     case 5:
    254         return swizzleLane5(v);
    255     case 6:
    256         return swizzleLane6(v);
    257     case 7:
    258         return swizzleLane7(v);
    259     case 8:
    260         return swizzleLane8(v);
    261     case 9:
    262         return swizzleLane9(v);
    263     case 10:
    264         return swizzleLaneA(v);
    265     case 11:
    266         return swizzleLaneB(v);
    267     case 12:
    268         return swizzleLaneC(v);
    269     case 13:
    270         return swizzleLaneD(v);
    271     case 14:
    272         return swizzleLaneE(v);
    273     case 15:
    274         return swizzleLaneF(v);
    275     default:
    276         return _mm_setzero_ps();
    277     }
    278 }
    279 
    280 #endif
    281 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    282 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    283 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    284 #if ENABLE_AVX512_SIMD16
    285 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    286 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    287 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    288 #endif
    289 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    290 
    291 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    292 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    293 #if ENABLE_AVX512_SIMD16
    294 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    295 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    296 #endif
    297 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    298 
    299 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    300 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    301 #if ENABLE_AVX512_SIMD16
    302 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    303 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    304 #endif
    305 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    306 
    307 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    308 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    309 #if ENABLE_AVX512_SIMD16
    310 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    311 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    312 #endif
    313 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    314 
    315 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    316 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    317 #if ENABLE_AVX512_SIMD16
    318 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    319 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    320 #endif
    321 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    322 
    323 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    324 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    325 #if ENABLE_AVX512_SIMD16
    326 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    327 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    328 #endif
    329 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    330 
    331 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    332 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    333 #if ENABLE_AVX512_SIMD16
    334 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    335 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    336 #endif
    337 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    338 
    339 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    340 #if ENABLE_AVX512_SIMD16
    341 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    342 #endif
    343 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    344 
    345 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    346 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    347 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
    348 #if ENABLE_AVX512_SIMD16
    349 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    350 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    351 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
    352 #endif
    353 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
    354 
    355 template <uint32_t TotalControlPoints>
    356 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    357 {
    358     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
    359     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
    360     // Each attribute has 4 components.
    361 
    362     /// @todo Optimize this
    363 
    364 #if USE_SIMD16_FRONTEND
    365     if (pa.useAlternateOffset)
    366     {
    367         primIndex += KNOB_SIMD_WIDTH;
    368     }
    369 
    370 #endif
    371     float* pOutVec = (float*)verts;
    372 
    373     for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
    374     {
    375         uint32_t input_cp = primIndex * TotalControlPoints + cp;
    376 #if USE_SIMD16_FRONTEND
    377         uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
    378         uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
    379 
    380 #else
    381         uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
    382         uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
    383 
    384 #endif
    385         // Loop over all components of the attribute
    386         for (uint32_t i = 0; i < 4; ++i)
    387         {
    388 #if USE_SIMD16_FRONTEND
    389             const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
    390 #else
    391             const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
    392 #endif
    393             pOutVec[cp * 4 + i] = pInputVec[input_lane];
    394         }
    395     }
    396 }
    397 
    398 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
    399 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    400 {
    401     SetNextPaState(
    402         pa,
    403         PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
    404         PaPatchListSingle<TotalControlPoints>);
    405 
    406     return false;
    407 }
    408 
    409 template<uint32_t TotalControlPoints>
    410 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    411 {
    412     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
    413     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
    414     // Each attribute has 4 components.
    415 
    416     /// @todo Optimize this
    417 
    418 #if USE_SIMD16_FRONTEND
    419     uint32_t lane_offset = 0;
    420 
    421     if (pa.useAlternateOffset)
    422     {
    423         lane_offset = KNOB_SIMD_WIDTH;
    424     }
    425 
    426 #endif
    427     // Loop over all components of the attribute
    428     for (uint32_t i = 0; i < 4; ++i)
    429     {
    430         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
    431         {
    432             float vec[KNOB_SIMD_WIDTH];
    433             for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
    434             {
    435 #if USE_SIMD16_FRONTEND
    436                 uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
    437                 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
    438                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
    439 
    440                 const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
    441 #else
    442                 uint32_t input_cp = lane * TotalControlPoints + cp;
    443                 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
    444                 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
    445 
    446                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
    447 #endif
    448                 vec[lane] = pInputVec[input_lane];
    449             }
    450             verts[cp][i] = _simd_loadu_ps(vec);
    451         }
    452     }
    453 
    454     SetNextPaState(
    455         pa,
    456         PaPatchList<TotalControlPoints>,
    457         PaPatchListSingle<TotalControlPoints>,
    458         0,
    459         PA_STATE_OPT::SIMD_WIDTH,
    460         true);
    461 
    462     return true;
    463 }
    464 
    465 #if ENABLE_AVX512_SIMD16
    466 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
    467 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
    468 {
    469     SetNextPaState_simd16(
    470         pa,
    471         PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
    472         PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
    473         PaPatchListSingle<TotalControlPoints>);
    474 
    475     return false;
    476 }
    477 
    478 template<uint32_t TotalControlPoints>
    479 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
    480 {
    481     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
    482     // KNOB_SIMD16_WIDTH * 1 patch.  This function is called once per attribute.
    483     // Each attribute has 4 components.
    484 
    485     /// @todo Optimize this
    486 
    487     // Loop over all components of the attribute
    488     for (uint32_t i = 0; i < 4; ++i)
    489     {
    490         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
    491         {
    492             float vec[KNOB_SIMD16_WIDTH];
    493             for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
    494             {
    495                 uint32_t input_cp = lane * TotalControlPoints + cp;
    496                 uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
    497                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
    498 
    499                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
    500                 vec[lane] = pInputVec[input_lane];
    501             }
    502             verts[cp][i] = _simd16_loadu_ps(vec);
    503         }
    504     }
    505 
    506     SetNextPaState_simd16(
    507         pa,
    508         PaPatchList_simd16<TotalControlPoints>,
    509         PaPatchList<TotalControlPoints>,
    510         PaPatchListSingle<TotalControlPoints>,
    511         0,
    512         PA_STATE_OPT::SIMD_WIDTH,
    513         true);
    514 
    515     return true;
    516 }
    517 
    518 #endif
    519 #define PA_PATCH_LIST_TERMINATOR(N) \
    520     template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
    521                            { return PaPatchListTerm<N>(pa, slot, verts); }
    522 PA_PATCH_LIST_TERMINATOR(1)
    523 PA_PATCH_LIST_TERMINATOR(2)
    524 PA_PATCH_LIST_TERMINATOR(3)
    525 PA_PATCH_LIST_TERMINATOR(4)
    526 PA_PATCH_LIST_TERMINATOR(5)
    527 PA_PATCH_LIST_TERMINATOR(6)
    528 PA_PATCH_LIST_TERMINATOR(7)
    529 PA_PATCH_LIST_TERMINATOR(8)
    530 PA_PATCH_LIST_TERMINATOR(9)
    531 PA_PATCH_LIST_TERMINATOR(10)
    532 PA_PATCH_LIST_TERMINATOR(11)
    533 PA_PATCH_LIST_TERMINATOR(12)
    534 PA_PATCH_LIST_TERMINATOR(13)
    535 PA_PATCH_LIST_TERMINATOR(14)
    536 PA_PATCH_LIST_TERMINATOR(15)
    537 PA_PATCH_LIST_TERMINATOR(16)
    538 PA_PATCH_LIST_TERMINATOR(17)
    539 PA_PATCH_LIST_TERMINATOR(18)
    540 PA_PATCH_LIST_TERMINATOR(19)
    541 PA_PATCH_LIST_TERMINATOR(20)
    542 PA_PATCH_LIST_TERMINATOR(21)
    543 PA_PATCH_LIST_TERMINATOR(22)
    544 PA_PATCH_LIST_TERMINATOR(23)
    545 PA_PATCH_LIST_TERMINATOR(24)
    546 PA_PATCH_LIST_TERMINATOR(25)
    547 PA_PATCH_LIST_TERMINATOR(26)
    548 PA_PATCH_LIST_TERMINATOR(27)
    549 PA_PATCH_LIST_TERMINATOR(28)
    550 PA_PATCH_LIST_TERMINATOR(29)
    551 PA_PATCH_LIST_TERMINATOR(30)
    552 PA_PATCH_LIST_TERMINATOR(31)
    553 PA_PATCH_LIST_TERMINATOR(32)
    554 #undef PA_PATCH_LIST_TERMINATOR
    555 
    556 #if ENABLE_AVX512_SIMD16
    557 #define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
    558     template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
    559                            { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
    560 PA_PATCH_LIST_TERMINATOR_SIMD16(1)
    561 PA_PATCH_LIST_TERMINATOR_SIMD16(2)
    562 PA_PATCH_LIST_TERMINATOR_SIMD16(3)
    563 PA_PATCH_LIST_TERMINATOR_SIMD16(4)
    564 PA_PATCH_LIST_TERMINATOR_SIMD16(5)
    565 PA_PATCH_LIST_TERMINATOR_SIMD16(6)
    566 PA_PATCH_LIST_TERMINATOR_SIMD16(7)
    567 PA_PATCH_LIST_TERMINATOR_SIMD16(8)
    568 PA_PATCH_LIST_TERMINATOR_SIMD16(9)
    569 PA_PATCH_LIST_TERMINATOR_SIMD16(10)
    570 PA_PATCH_LIST_TERMINATOR_SIMD16(11)
    571 PA_PATCH_LIST_TERMINATOR_SIMD16(12)
    572 PA_PATCH_LIST_TERMINATOR_SIMD16(13)
    573 PA_PATCH_LIST_TERMINATOR_SIMD16(14)
    574 PA_PATCH_LIST_TERMINATOR_SIMD16(15)
    575 PA_PATCH_LIST_TERMINATOR_SIMD16(16)
    576 PA_PATCH_LIST_TERMINATOR_SIMD16(17)
    577 PA_PATCH_LIST_TERMINATOR_SIMD16(18)
    578 PA_PATCH_LIST_TERMINATOR_SIMD16(19)
    579 PA_PATCH_LIST_TERMINATOR_SIMD16(20)
    580 PA_PATCH_LIST_TERMINATOR_SIMD16(21)
    581 PA_PATCH_LIST_TERMINATOR_SIMD16(22)
    582 PA_PATCH_LIST_TERMINATOR_SIMD16(23)
    583 PA_PATCH_LIST_TERMINATOR_SIMD16(24)
    584 PA_PATCH_LIST_TERMINATOR_SIMD16(25)
    585 PA_PATCH_LIST_TERMINATOR_SIMD16(26)
    586 PA_PATCH_LIST_TERMINATOR_SIMD16(27)
    587 PA_PATCH_LIST_TERMINATOR_SIMD16(28)
    588 PA_PATCH_LIST_TERMINATOR_SIMD16(29)
    589 PA_PATCH_LIST_TERMINATOR_SIMD16(30)
    590 PA_PATCH_LIST_TERMINATOR_SIMD16(31)
    591 PA_PATCH_LIST_TERMINATOR_SIMD16(32)
    592 #undef PA_PATCH_LIST_TERMINATOR_SIMD16
    593 
    594 #endif
    595 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    596 {
    597     SetNextPaState(pa, PaTriList1, PaTriListSingle0);
    598     return false;    // Not enough vertices to assemble 4 or 8 triangles.
    599 }
    600 
    601 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    602 {
    603     SetNextPaState(pa, PaTriList2, PaTriListSingle0);
    604     return false;    // Not enough vertices to assemble 8 triangles.
    605 }
    606 
    607 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    608 {
            // Final triangle-list state.  Takes three 8-wide input vectors a, b, c
            // (24 consecutive vertices) for attribute `slot` and de-interleaves them
            // into verts[0], verts[1], verts[2], so that lane n of verts[k] holds
            // vertex k of triangle n.  All four components are processed.  Afterwards
            // the state machine is reset to PaTriList0 and true is returned.
            //
            // Two arch-specific implementations follow.  Both start with the same pair
            // of blends (masks 0x92/0x24, 0x24/0x49, 0x49/0x92) to pick every third
            // element out of a, b and c, and differ only in how the picked elements are
            // moved into triangle order.
    609 #if KNOB_ARCH == KNOB_ARCH_AVX
    610 #if USE_SIMD16_FRONTEND
    611     simdvector a;
    612     simdvector b;
    613     simdvector c;
    614 
            // The 16-wide frontend stores 16-lane vectors; build the three 8-wide
            // inputs from their two parts.  Without the alternate offset these are
            // part 0 and part 1 of vector 0 plus part 0 of vector 1; with it, part 1
            // of vector 1 plus both parts of vector 2.
    615     if (!pa.useAlternateOffset)
    616     {
    617         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
    618         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
    619 
    620         for (uint32_t i = 0; i < 4; i += 1)
    621         {
    622             a[i] = _simd16_extract_ps(a_16[i], 0);
    623             b[i] = _simd16_extract_ps(a_16[i], 1);
    624             c[i] = _simd16_extract_ps(b_16[i], 0);
    625         }
    626     }
    627     else
    628     {
    629         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
    630         const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
    631 
    632         for (uint32_t i = 0; i < 4; i += 1)
    633         {
    634             a[i] = _simd16_extract_ps(b_16[i], 1);
    635             b[i] = _simd16_extract_ps(c_16[i], 0);
    636             c[i] = _simd16_extract_ps(c_16[i], 1);
    637         }
    638     }
    639 
    640 #else
    641     simdvector &a = PaGetSimdVector(pa, 0, slot);
    642     simdvector &b = PaGetSimdVector(pa, 1, slot);
    643     simdvector &c = PaGetSimdVector(pa, 2, slot);
    644 
    645 #endif
    646     simdscalar s;
    647 
    648     // Tri Pattern - provoking vertex is always v0
    649     //  v0 -> 0 3 6 9  12 15 18 21
    650     //  v1 -> 1 4 7 10 13 16 19 22
    651     //  v2 -> 2 5 8 11 14 17 20 23
    652 
            // Per component: blend the wanted elements of a, b, c into one register,
            // reorder them inside each 128-bit half with _mm256_permute_ps, then use
            // a copy with the two halves swapped (_mm256_permute2f128_ps, 0x21) and a
            // final blend to move the elements that must cross halves.
    653     for (int i = 0; i < 4; ++i)
    654     {
    655         simdvector& v0 = verts[0];
    656         v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
    657         v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
    658         v0[i] = _mm256_permute_ps(v0[i], 0x6C);
    659         s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
    660         v0[i] = _simd_blend_ps(v0[i], s, 0x44);
    661 
    662         simdvector& v1 = verts[1];
    663         v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
    664         v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
    665         v1[i] = _mm256_permute_ps(v1[i], 0xB1);
    666         s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
    667         v1[i] = _simd_blend_ps(v1[i], s, 0x66);
    668 
    669         simdvector& v2 = verts[2];
    670         v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
    671         v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
    672         v2[i] = _mm256_permute_ps(v2[i], 0xC6);
    673         s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
    674         v2[i] = _simd_blend_ps(v2[i], s, 0x22);
    675     }
    676 
    677 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
            // Lane index tables for the final permute; _simd_set_epi32 lists the
            // highest lane first, so perm0 reads 0, 3, 6, 1, 4, 7, 2, 5 from lane 0 up.
    678     const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
    679     const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
    680     const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
    681 
    682 #if USE_SIMD16_FRONTEND
    683     simdvector a;
    684     simdvector b;
    685     simdvector c;
    686 
            // Same input selection as in the AVX path above.
    687     if (!pa.useAlternateOffset)
    688     {
    689         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
    690         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
    691 
    692         for (uint32_t i = 0; i < 4; i += 1)
    693         {
    694             a[i] = _simd16_extract_ps(a_16[i], 0);
    695             b[i] = _simd16_extract_ps(a_16[i], 1);
    696             c[i] = _simd16_extract_ps(b_16[i], 0);
    697         }
    698     }
    699     else
    700     {
    701         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
    702         const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
    703 
    704         for (uint32_t i = 0; i < 4; i += 1)
    705         {
    706             a[i] = _simd16_extract_ps(b_16[i], 1);
    707             b[i] = _simd16_extract_ps(c_16[i], 0);
    708             c[i] = _simd16_extract_ps(c_16[i], 1);
    709         }
    710     }
    711 
    712 #else
    713     const simdvector &a = PaGetSimdVector(pa, 0, slot);
    714     const simdvector &b = PaGetSimdVector(pa, 1, slot);
    715     const simdvector &c = PaGetSimdVector(pa, 2, slot);
    716 
    717 #endif
    718     //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
    719     //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
    720     //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
    721 
    722     simdvector &v0 = verts[0];
    723     simdvector &v1 = verts[1];
    724     simdvector &v2 = verts[2];
    725 
    726     // for simd x, y, z, and w
    727     for (int i = 0; i < 4; ++i)
    728     {
                // Blend first (elements stay in their source lanes), then a single
                // _simd_permute_ps with the index table puts them in triangle order.
    729         simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
    730         simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
    731         simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
    732 
    733         v0[i] = _simd_permute_ps(temp0, perm0);
    734         v1[i] = _simd_permute_ps(temp1, perm1);
    735         v2[i] = _simd_permute_ps(temp2, perm2);
    736     }
    737 
    738 #endif
            // All three input vectors are consumed; restart at PaTriList0.
    739     SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    740     return true;
    741 }
    742 
    743 #if ENABLE_AVX512_SIMD16
/// @brief State 0 of the SIMD16 triangle-list assembly state machine.
///        Assembles nothing: it only hands PaTriList1_simd16, PaTriList1 and
///        PaTriListSingle0 to SetNextPaState_simd16 as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
    return false;    // Not enough vertices to assemble 16 triangles
}
    749 
/// @brief State 1 of the SIMD16 triangle-list assembly state machine.
///        Assembles nothing: it only hands PaTriList2_simd16, PaTriList2 and
///        PaTriListSingle0 to SetNextPaState_simd16 as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
    return false;    // Not enough vertices to assemble 16 triangles
}
    755 
    756 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
    757 {
    758     const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 5, 2, 15, 12,  9, 6, 3, 0);
    759     const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 6, 3,  0, 13, 10, 7, 4, 1);
    760     const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 7, 4,  1, 14, 11, 8, 5, 2);
    761 
    762     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
    763     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
    764     const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
    765 
    766     simd16vector &v0 = verts[0];
    767     simd16vector &v1 = verts[1];
    768     simd16vector &v2 = verts[2];
    769 
    770     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
    771     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
    772     //  v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
    773 
    774     // for simd16 x, y, z, and w
    775     for (int i = 0; i < 4; i += 1)
    776     {
    777         simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
    778         simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
    779         simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
    780 
    781         v0[i] = _simd16_permute_ps(temp0, perm0);
    782         v1[i] = _simd16_permute_ps(temp1, perm1);
    783         v2[i] = _simd16_permute_ps(temp2, perm2);
    784     }
    785 
    786     SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    787     return true;
    788 }
    789 
    790 #endif
    791 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    792 {
    793 #if USE_SIMD16_FRONTEND
    794     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
    795     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
    796     const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
    797 
    798     if (pa.useAlternateOffset)
    799     {
    800         primIndex += KNOB_SIMD_WIDTH;
    801     }
    802 
    803     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
    804     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
    805     //  v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
    806 
    807     switch (primIndex)
    808     {
    809     case 0:
    810         verts[0] = swizzleLane0(a);
    811         verts[1] = swizzleLane1(a);
    812         verts[2] = swizzleLane2(a);
    813         break;
    814     case 1:
    815         verts[0] = swizzleLane3(a);
    816         verts[1] = swizzleLane4(a);
    817         verts[2] = swizzleLane5(a);
    818         break;
    819     case 2:
    820         verts[0] = swizzleLane6(a);
    821         verts[1] = swizzleLane7(a);
    822         verts[2] = swizzleLane8(a);
    823         break;
    824     case 3:
    825         verts[0] = swizzleLane9(a);
    826         verts[1] = swizzleLaneA(a);
    827         verts[2] = swizzleLaneB(a);
    828         break;
    829     case 4:
    830         verts[0] = swizzleLaneC(a);
    831         verts[1] = swizzleLaneD(a);
    832         verts[2] = swizzleLaneE(a);
    833         break;
    834     case 5:
    835         verts[0] = swizzleLaneF(a);
    836         verts[1] = swizzleLane0(b);
    837         verts[2] = swizzleLane1(b);
    838         break;
    839     case 6:
    840         verts[0] = swizzleLane2(b);
    841         verts[1] = swizzleLane3(b);
    842         verts[2] = swizzleLane4(b);
    843         break;
    844     case 7:
    845         verts[0] = swizzleLane5(b);
    846         verts[1] = swizzleLane6(b);
    847         verts[2] = swizzleLane7(b);
    848         break;
    849     case 8:
    850         verts[0] = swizzleLane8(b);
    851         verts[1] = swizzleLane9(b);
    852         verts[2] = swizzleLaneA(b);
    853         break;
    854     case 9:
    855         verts[0] = swizzleLaneB(b);
    856         verts[1] = swizzleLaneC(b);
    857         verts[2] = swizzleLaneD(b);
    858         break;
    859     case 10:
    860         verts[0] = swizzleLaneE(b);
    861         verts[1] = swizzleLaneF(b);
    862         verts[2] = swizzleLane0(c);
    863         break;
    864     case 11:
    865         verts[0] = swizzleLane1(c);
    866         verts[1] = swizzleLane2(c);
    867         verts[2] = swizzleLane3(c);
    868         break;
    869     case 12:
    870         verts[0] = swizzleLane4(c);
    871         verts[1] = swizzleLane5(c);
    872         verts[2] = swizzleLane6(c);
    873         break;
    874     case 13:
    875         verts[0] = swizzleLane7(c);
    876         verts[1] = swizzleLane8(c);
    877         verts[2] = swizzleLane9(c);
    878         break;
    879     case 14:
    880         verts[0] = swizzleLaneA(c);
    881         verts[1] = swizzleLaneB(c);
    882         verts[2] = swizzleLaneC(c);
    883         break;
    884     case 15:
    885         verts[0] = swizzleLaneD(c);
    886         verts[1] = swizzleLaneE(c);
    887         verts[2] = swizzleLaneF(c);
    888         break;
    889     };
    890 #else
    891     // We have 12 simdscalars contained within 3 simdvectors which
    892     // hold at least 8 triangles worth of data. We want to assemble a single
    893     // triangle with data in horizontal form.
    894 
    895     const simdvector &a = PaGetSimdVector(pa, 0, slot);
    896     const simdvector &b = PaGetSimdVector(pa, 1, slot);
    897     const simdvector &c = PaGetSimdVector(pa, 2, slot);
    898 
    899     // Convert from vertical to horizontal.
    900     // Tri Pattern - provoking vertex is always v0
    901     //  v0 -> 0 3 6 9  12 15 18 21
    902     //  v1 -> 1 4 7 10 13 16 19 22
    903     //  v2 -> 2 5 8 11 14 17 20 23
    904 
    905     switch (primIndex)
    906     {
    907     case 0:
    908         verts[0] = swizzleLane0(a);
    909         verts[1] = swizzleLane1(a);
    910         verts[2] = swizzleLane2(a);
    911         break;
    912     case 1:
    913         verts[0] = swizzleLane3(a);
    914         verts[1] = swizzleLane4(a);
    915         verts[2] = swizzleLane5(a);
    916         break;
    917     case 2:
    918         verts[0] = swizzleLane6(a);
    919         verts[1] = swizzleLane7(a);
    920         verts[2] = swizzleLane0(b);
    921         break;
    922     case 3:
    923         verts[0] = swizzleLane1(b);
    924         verts[1] = swizzleLane2(b);
    925         verts[2] = swizzleLane3(b);
    926         break;
    927     case 4:
    928         verts[0] = swizzleLane4(b);
    929         verts[1] = swizzleLane5(b);
    930         verts[2] = swizzleLane6(b);
    931         break;
    932     case 5:
    933         verts[0] = swizzleLane7(b);
    934         verts[1] = swizzleLane0(c);
    935         verts[2] = swizzleLane1(c);
    936         break;
    937     case 6:
    938         verts[0] = swizzleLane2(c);
    939         verts[1] = swizzleLane3(c);
    940         verts[2] = swizzleLane4(c);
    941         break;
    942     case 7:
    943         verts[0] = swizzleLane5(c);
    944         verts[1] = swizzleLane6(c);
    945         verts[2] = swizzleLane7(c);
    946         break;
    947     };
    948 #endif
    949 }
    950 
/// @brief State 0 of the triangle-strip assembly state machine.
///        Assembles nothing: it only hands PaTriStrip1 and PaTriStripSingle0
///        to SetNextPaState as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
    return false;    // Not enough vertices to assemble 8 triangles.
}
    956 
    957 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
    958 {
    959 #if USE_SIMD16_FRONTEND
    960     simdvector a;
    961     simdvector b;
    962 
    963     if (!pa.useAlternateOffset)
    964     {
    965         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
    966 
    967         for (uint32_t i = 0; i < 4; i += 1)
    968         {
    969             a[i] = _simd16_extract_ps(a_16[i], 0);
    970             b[i] = _simd16_extract_ps(a_16[i], 1);
    971         }
    972     }
    973     else
    974     {
    975         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
    976 
    977         for (uint32_t i = 0; i < 4; i += 1)
    978         {
    979             a[i] = _simd16_extract_ps(b_16[i], 0);
    980             b[i] = _simd16_extract_ps(b_16[i], 1);
    981         }
    982     }
    983 
    984 #else
    985     simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
    986     simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
    987 
    988 #endif
    989     simdscalar s;
    990 
    991     for(int i = 0; i < 4; ++i)
    992     {
    993         simdscalar a0 = a[i];
    994         simdscalar b0 = b[i];
    995 
    996         // Tri Pattern - provoking vertex is always v0
    997         //  v0 -> 01234567
    998         //  v1 -> 13355779
    999         //  v2 -> 22446688
   1000         simdvector& v0 = verts[0];
   1001         v0[i] = a0;
   1002 
   1003         //  s -> 4567891011
   1004         s = _simd_permute2f128_ps(a0, b0, 0x21);
   1005         //  s -> 23456789
   1006         s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
   1007 
   1008         simdvector& v1 = verts[1];
   1009         //  v1 -> 13355779
   1010         v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
   1011 
   1012         simdvector& v2 = verts[2];
   1013         //  v2 -> 22446688
   1014         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
   1015     }
   1016 
   1017     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1018     return true;
   1019 }
   1020 
   1021 #if  ENABLE_AVX512_SIMD16
/// @brief State 0 of the SIMD16 triangle-strip assembly state machine.
///        Assembles nothing: it only hands PaTriStrip1_simd16, PaTriStrip1 and
///        PaTriStripSingle0 to SetNextPaState_simd16 as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
    return false;    // Not enough vertices to assemble 16 triangles.
}
   1027 
   1028 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1029 {
   1030     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1031     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
   1032 
   1033     simd16vector &v0 = verts[0];
   1034     simd16vector &v1 = verts[1];
   1035     simd16vector &v2 = verts[2];
   1036 
   1037     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   1038     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
   1039     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
   1040 
   1041     // for simd16 x, y, z, and w
   1042     for (int i = 0; i < 4; i += 1)
   1043     {
   1044         simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
   1045         simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
   1046 
   1047         simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000);                                // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
   1048         simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
   1049 
   1050         v0[i] = a[i];                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   1051         v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
   1052         v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
   1053     }
   1054 
   1055     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1056     return true;
   1057 }
   1058 
   1059 #endif
   1060 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1061 {
   1062 #if USE_SIMD16_FRONTEND
   1063     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1064     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
   1065 
   1066     if (pa.useAlternateOffset)
   1067     {
   1068         primIndex += KNOB_SIMD_WIDTH;
   1069     }
   1070 
   1071     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   1072     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
   1073     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
   1074 
   1075     switch (primIndex)
   1076     {
   1077     case 0:
   1078         verts[0] = swizzleLane0(a);
   1079         verts[1] = swizzleLane1(a);
   1080         verts[2] = swizzleLane2(a);
   1081         break;
   1082     case 1:
   1083         verts[0] = swizzleLane1(a);
   1084         verts[1] = swizzleLane3(a);
   1085         verts[2] = swizzleLane2(a);
   1086         break;
   1087     case 2:
   1088         verts[0] = swizzleLane2(a);
   1089         verts[1] = swizzleLane3(a);
   1090         verts[2] = swizzleLane4(a);
   1091         break;
   1092     case 3:
   1093         verts[0] = swizzleLane3(a);
   1094         verts[1] = swizzleLane5(a);
   1095         verts[2] = swizzleLane4(a);
   1096         break;
   1097     case 4:
   1098         verts[0] = swizzleLane4(a);
   1099         verts[1] = swizzleLane5(a);
   1100         verts[2] = swizzleLane6(a);
   1101         break;
   1102     case 5:
   1103         verts[0] = swizzleLane5(a);
   1104         verts[1] = swizzleLane7(a);
   1105         verts[2] = swizzleLane6(a);
   1106         break;
   1107     case 6:
   1108         verts[0] = swizzleLane6(a);
   1109         verts[1] = swizzleLane7(a);
   1110         verts[2] = swizzleLane8(a);
   1111         break;
   1112     case 7:
   1113         verts[0] = swizzleLane7(a);
   1114         verts[1] = swizzleLane9(a);
   1115         verts[2] = swizzleLane8(a);
   1116         break;
   1117     case 8:
   1118         verts[0] = swizzleLane8(a);
   1119         verts[1] = swizzleLane9(a);
   1120         verts[2] = swizzleLaneA(a);
   1121         break;
   1122     case 9:
   1123         verts[0] = swizzleLane9(a);
   1124         verts[1] = swizzleLaneB(a);
   1125         verts[2] = swizzleLaneA(a);
   1126         break;
   1127     case 10:
   1128         verts[0] = swizzleLaneA(a);
   1129         verts[1] = swizzleLaneB(a);
   1130         verts[2] = swizzleLaneC(a);
   1131         break;
   1132     case 11:
   1133         verts[0] = swizzleLaneB(a);
   1134         verts[1] = swizzleLaneD(a);
   1135         verts[2] = swizzleLaneC(a);
   1136         break;
   1137     case 12:
   1138         verts[0] = swizzleLaneC(a);
   1139         verts[1] = swizzleLaneD(a);
   1140         verts[2] = swizzleLaneE(a);
   1141         break;
   1142     case 13:
   1143         verts[0] = swizzleLaneD(a);
   1144         verts[1] = swizzleLaneF(a);
   1145         verts[2] = swizzleLaneE(a);
   1146         break;
   1147     case 14:
   1148         verts[0] = swizzleLaneE(a);
   1149         verts[1] = swizzleLaneF(a);
   1150         verts[2] = swizzleLane0(b);
   1151         break;
   1152     case 15:
   1153         verts[0] = swizzleLaneF(a);
   1154         verts[1] = swizzleLane1(b);
   1155         verts[2] = swizzleLane0(b);
   1156         break;
   1157     };
   1158 #else
   1159     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
   1160     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
   1161 
   1162     // Convert from vertical to horizontal.
   1163     // Tri Pattern - provoking vertex is always v0
   1164     //  v0 -> 01234567
   1165     //  v1 -> 13355779
   1166     //  v2 -> 22446688
   1167 
   1168     switch (primIndex)
   1169     {
   1170     case 0:
   1171         verts[0] = swizzleLane0(a);
   1172         verts[1] = swizzleLane1(a);
   1173         verts[2] = swizzleLane2(a);
   1174         break;
   1175     case 1:
   1176         verts[0] = swizzleLane1(a);
   1177         verts[1] = swizzleLane3(a);
   1178         verts[2] = swizzleLane2(a);
   1179         break;
   1180     case 2:
   1181         verts[0] = swizzleLane2(a);
   1182         verts[1] = swizzleLane3(a);
   1183         verts[2] = swizzleLane4(a);
   1184         break;
   1185     case 3:
   1186         verts[0] = swizzleLane3(a);
   1187         verts[1] = swizzleLane5(a);
   1188         verts[2] = swizzleLane4(a);
   1189         break;
   1190     case 4:
   1191         verts[0] = swizzleLane4(a);
   1192         verts[1] = swizzleLane5(a);
   1193         verts[2] = swizzleLane6(a);
   1194         break;
   1195     case 5:
   1196         verts[0] = swizzleLane5(a);
   1197         verts[1] = swizzleLane7(a);
   1198         verts[2] = swizzleLane6(a);
   1199         break;
   1200     case 6:
   1201         verts[0] = swizzleLane6(a);
   1202         verts[1] = swizzleLane7(a);
   1203         verts[2] = swizzleLane0(b);
   1204         break;
   1205     case 7:
   1206         verts[0] = swizzleLane7(a);
   1207         verts[1] = swizzleLane1(b);
   1208         verts[2] = swizzleLane0(b);
   1209         break;
   1210     };
   1211 #endif
   1212 }
   1213 
/// @brief State 0 of the triangle-fan assembly state machine.
///        Assembles nothing: it only hands PaTriFan1 and PaTriFanSingle0
///        to SetNextPaState as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
    return false;    // Not enough vertices to assemble 8 triangles.
}
   1219 
   1220 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1221 {
   1222 #if USE_SIMD16_FRONTEND
   1223     simdvector leadVert;
   1224     simdvector a;
   1225     simdvector b;
   1226 
   1227     const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
   1228 
   1229     if (!pa.useAlternateOffset)
   1230     {
   1231         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1232 
   1233         for (uint32_t i = 0; i < 4; i += 1)
   1234         {
   1235             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
   1236 
   1237             a[i] = _simd16_extract_ps(a_16[i], 0);
   1238             b[i] = _simd16_extract_ps(a_16[i], 1);
   1239         }
   1240     }
   1241     else
   1242     {
   1243         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
   1244 
   1245         for (uint32_t i = 0; i < 4; i += 1)
   1246         {
   1247             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
   1248 
   1249             a[i] = _simd16_extract_ps(b_16[i], 0);
   1250             b[i] = _simd16_extract_ps(b_16[i], 1);
   1251         }
   1252     }
   1253 
   1254 #else
   1255     const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot);
   1256     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
   1257     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
   1258 
   1259 #endif
   1260     simdscalar s;
   1261 
   1262     // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
   1263     for(int i = 0; i < 4; ++i)
   1264     {
   1265         simdscalar a0 = a[i];
   1266         simdscalar b0 = b[i];
   1267 
   1268         simdscalar comp = leadVert[i];
   1269 
   1270         simdvector& v0 = verts[0];
   1271         v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
   1272         v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00);
   1273 
   1274         simdvector& v2 = verts[2];
   1275         s = _simd_permute2f128_ps(a0, b0, 0x21);
   1276         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
   1277 
   1278         simdvector& v1 = verts[1];
   1279         v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
   1280     }
   1281 
   1282     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1283     return true;
   1284 }
   1285 
   1286 #if ENABLE_AVX512_SIMD16
/// @brief State 0 of the SIMD16 triangle-fan assembly state machine.
///        Assembles nothing: it only hands PaTriFan1_simd16, PaTriFan1 and
///        PaTriFanSingle0 to SetNextPaState_simd16 as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
    return false;    // Not enough vertices to assemble 16 triangles.
}
   1292 
   1293 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1294 {
   1295     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
   1296     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1297     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
   1298 
   1299     simd16vector &v0 = verts[0];
   1300     simd16vector &v1 = verts[1];
   1301     simd16vector &v2 = verts[2];
   1302 
   1303     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
   1304     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
   1305     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
   1306 
   1307     // for simd16 x, y, z, and w
   1308     for (uint32_t i = 0; i < 4; i += 1)
   1309     {
   1310         simd16scalar shuff = _simd16_shuffle_ps(a[i], a[i], _MM_SHUFFLE(0, 0, 0, 0));               // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
   1311 
   1312         v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                        // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
   1313 
   1314         simd16scalar temp0 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
   1315         simd16scalar temp1 = _simd16_permute2f128_ps(c[i], c[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
   1316 
   1317         simd16scalar blend = _simd16_blend_ps(temp0, temp1, 0xF000);                                // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
   1318 
   1319         v2[i] = _simd16_shuffle_ps(b[i], blend, _MM_SHUFFLE(1, 0, 3, 2));                           // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
   1320         v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
   1321     }
   1322 
   1323     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1324     return true;
   1325 }
   1326 
   1327 #endif
   1328 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1329 {
   1330 #if USE_SIMD16_FRONTEND
   1331     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
   1332     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1333     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
   1334 
   1335     if (pa.useAlternateOffset)
   1336     {
   1337         primIndex += KNOB_SIMD_WIDTH;
   1338     }
   1339 
   1340     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
   1341     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
   1342     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
   1343 
   1344     // vert 0 from leading vertex
   1345     verts[0] = swizzleLane0(a);
   1346 
   1347     // vert 1
   1348     if (primIndex < 15)
   1349     {
   1350         verts[1] = swizzleLaneN(b, primIndex + 1);
   1351     }
   1352     else
   1353     {
   1354         verts[1] = swizzleLane0(c);
   1355     }
   1356 
   1357     // vert 2
   1358     if (primIndex < 14)
   1359     {
   1360         verts[2] = swizzleLaneN(b, primIndex + 2);
   1361     }
   1362     else
   1363     {
   1364         verts[2] = swizzleLaneN(c, primIndex - 14);
   1365     }
   1366 #else
   1367     const simdvector &a = PaGetSimdVector(pa, pa.first, slot);
   1368     const simdvector &b = PaGetSimdVector(pa, pa.prev, slot);
   1369     const simdvector &c = PaGetSimdVector(pa, pa.cur, slot);
   1370 
   1371     // vert 0 from leading vertex
   1372     verts[0] = swizzleLane0(a);
   1373 
   1374     // vert 1
   1375     if (primIndex < 7)
   1376     {
   1377         verts[1] = swizzleLaneN(b, primIndex + 1);
   1378     }
   1379     else
   1380     {
   1381         verts[1] = swizzleLane0(c);
   1382     }
   1383 
   1384     // vert 2
   1385     if (primIndex < 6)
   1386     {
   1387         verts[2] = swizzleLaneN(b, primIndex + 2);
   1388     }
   1389     else
   1390     {
   1391         verts[2] = swizzleLaneN(c, primIndex - 6);
   1392     }
   1393 #endif
   1394 }
   1395 
/// @brief State 0 of the quad-list assembly state machine.
///        Assembles nothing: it only hands PaQuadList1 and PaQuadListSingle0
///        to SetNextPaState as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
    return false;    // Not enough vertices to assemble 8 triangles.
}
   1401 
   1402 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1403 {
   1404 #if USE_SIMD16_FRONTEND
   1405     simdvector a;
   1406     simdvector b;
   1407 
   1408     if (!pa.useAlternateOffset)
   1409     {
   1410         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
   1411 
   1412         for (uint32_t i = 0; i < 4; i += 1)
   1413         {
   1414             a[i] = _simd16_extract_ps(a_16[i], 0);
   1415             b[i] = _simd16_extract_ps(a_16[i], 1);
   1416         }
   1417     }
   1418     else
   1419     {
   1420         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
   1421 
   1422         for (uint32_t i = 0; i < 4; i += 1)
   1423         {
   1424             a[i] = _simd16_extract_ps(b_16[i], 0);
   1425             b[i] = _simd16_extract_ps(b_16[i], 1);
   1426         }
   1427     }
   1428 
   1429 #else
   1430     simdvector &a = PaGetSimdVector(pa, 0, slot);
   1431     simdvector &b = PaGetSimdVector(pa, 1, slot);
   1432 
   1433 #endif
   1434     simdscalar s1, s2;
   1435 
   1436     for(int i = 0; i < 4; ++i)
   1437     {
   1438         simdscalar a0 = a[i];
   1439         simdscalar b0 = b[i];
   1440 
   1441         s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
   1442         s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
   1443 
   1444         simdvector& v0 = verts[0];
   1445         v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
   1446 
   1447         simdvector& v1 = verts[1];
   1448         v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
   1449 
   1450         simdvector& v2 = verts[2];
   1451         v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
   1452     }
   1453 
   1454     SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   1455     return true;
   1456 }
   1457 
   1458 #if ENABLE_AVX512_SIMD16
/// @brief State 0 of the SIMD16 quad-list assembly state machine.
///        Assembles nothing: it only hands PaQuadList1_simd16, PaQuadList1 and
///        PaQuadListSingle0 to SetNextPaState_simd16 as the next handlers.
///        Neither slot nor verts is touched.
/// @return false, i.e. no primitives were produced by this call.
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
    return false;    // Not enough vertices to assemble 16 triangles.
}
   1464 
   1465 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1466 {
   1467     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
   1468     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
   1469 
   1470     simd16vector &v0 = verts[0];
   1471     simd16vector &v1 = verts[1];
   1472     simd16vector &v2 = verts[2];
   1473 
   1474     //  v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
   1475     //  v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
   1476     //  v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
   1477 
   1478     // for simd16 x, y, z, and w
   1479     for (uint32_t i = 0; i < 4; i += 1)
   1480     {
   1481         simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) = 10 00 10 00  // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
   1482         simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) = 11 01 11 01  // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
   1483 
   1484         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                          // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
   1485         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                          // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
   1486         v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                          // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
   1487     }
   1488 
   1489     SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   1490     return true;
   1491 }
   1492 
   1493 #endif
   1494 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1495 {
   1496 #if USE_SIMD16_FRONTEND
   1497     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
   1498     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
   1499 
   1500     if (pa.useAlternateOffset)
   1501     {
   1502         primIndex += KNOB_SIMD_WIDTH;
   1503     }
   1504 
   1505     switch (primIndex)
   1506     {
   1507     case 0:
   1508         // triangle 0 - 0 1 2
   1509         verts[0] = swizzleLane0(a);
   1510         verts[1] = swizzleLane1(a);
   1511         verts[2] = swizzleLane2(a);
   1512         break;
   1513     case 1:
   1514         // triangle 1 - 0 2 3
   1515         verts[0] = swizzleLane0(a);
   1516         verts[1] = swizzleLane2(a);
   1517         verts[2] = swizzleLane3(a);
   1518         break;
   1519     case 2:
   1520         // triangle 2 - 4 5 6
   1521         verts[0] = swizzleLane4(a);
   1522         verts[1] = swizzleLane5(a);
   1523         verts[2] = swizzleLane6(a);
   1524         break;
   1525     case 3:
   1526         // triangle 3 - 4 6 7
   1527         verts[0] = swizzleLane4(a);
   1528         verts[1] = swizzleLane6(a);
   1529         verts[2] = swizzleLane7(a);
   1530         break;
   1531     case 4:
   1532         // triangle 4 - 8 9 A
   1533         verts[0] = swizzleLane8(a);
   1534         verts[1] = swizzleLane9(a);
   1535         verts[2] = swizzleLaneA(a);
   1536         break;
   1537     case 5:
   1538         // triangle 5 - 8 A B
   1539         verts[0] = swizzleLane8(a);
   1540         verts[1] = swizzleLaneA(a);
   1541         verts[2] = swizzleLaneB(a);
   1542         break;
   1543     case 6:
   1544         // triangle 6 - C D E
   1545         verts[0] = swizzleLaneC(a);
   1546         verts[1] = swizzleLaneD(a);
   1547         verts[2] = swizzleLaneE(a);
   1548         break;
   1549     case 7:
   1550         // triangle 7 - C E F
   1551         verts[0] = swizzleLaneC(a);
   1552         verts[1] = swizzleLaneE(a);
   1553         verts[2] = swizzleLaneF(a);
   1554         break;
   1555     case 8:
   1556         // triangle 0 - 0 1 2
   1557         verts[0] = swizzleLane0(b);
   1558         verts[1] = swizzleLane1(b);
   1559         verts[2] = swizzleLane2(b);
   1560         break;
   1561     case 9:
   1562         // triangle 1 - 0 2 3
   1563         verts[0] = swizzleLane0(b);
   1564         verts[1] = swizzleLane2(b);
   1565         verts[2] = swizzleLane3(b);
   1566         break;
   1567     case 10:
   1568         // triangle 2 - 4 5 6
   1569         verts[0] = swizzleLane4(b);
   1570         verts[1] = swizzleLane5(b);
   1571         verts[2] = swizzleLane6(b);
   1572         break;
   1573     case 11:
   1574         // triangle 3 - 4 6 7
   1575         verts[0] = swizzleLane4(b);
   1576         verts[1] = swizzleLane6(b);
   1577         verts[2] = swizzleLane7(b);
   1578         break;
   1579     case 12:
   1580         // triangle 4 - 8 9 A
   1581         verts[0] = swizzleLane8(b);
   1582         verts[1] = swizzleLane9(b);
   1583         verts[2] = swizzleLaneA(b);
   1584         break;
   1585     case 13:
   1586         // triangle 5 - 8 A B
   1587         verts[0] = swizzleLane8(b);
   1588         verts[1] = swizzleLaneA(b);
   1589         verts[2] = swizzleLaneB(b);
   1590         break;
   1591     case 14:
   1592         // triangle 6 - C D E
   1593         verts[0] = swizzleLaneC(b);
   1594         verts[1] = swizzleLaneD(b);
   1595         verts[2] = swizzleLaneE(b);
   1596         break;
   1597     case 15:
   1598         // triangle 7 - C E F
   1599         verts[0] = swizzleLaneC(b);
   1600         verts[1] = swizzleLaneE(b);
   1601         verts[2] = swizzleLaneF(b);
   1602         break;
   1603     }
   1604 #else
   1605     const simdvector &a = PaGetSimdVector(pa, 0, slot);
   1606     const simdvector &b = PaGetSimdVector(pa, 1, slot);
   1607 
   1608     switch (primIndex)
   1609     {
   1610     case 0:
   1611         // triangle 0 - 0 1 2
   1612         verts[0] = swizzleLane0(a);
   1613         verts[1] = swizzleLane1(a);
   1614         verts[2] = swizzleLane2(a);
   1615         break;
   1616     case 1:
   1617         // triangle 1 - 0 2 3
   1618         verts[0] = swizzleLane0(a);
   1619         verts[1] = swizzleLane2(a);
   1620         verts[2] = swizzleLane3(a);
   1621         break;
   1622     case 2:
   1623         // triangle 2 - 4 5 6
   1624         verts[0] = swizzleLane4(a);
   1625         verts[1] = swizzleLane5(a);
   1626         verts[2] = swizzleLane6(a);
   1627         break;
   1628     case 3:
   1629         // triangle 3 - 4 6 7
   1630         verts[0] = swizzleLane4(a);
   1631         verts[1] = swizzleLane6(a);
   1632         verts[2] = swizzleLane7(a);
   1633         break;
   1634     case 4:
   1635         // triangle 4 - 8 9 10 (0 1 2)
   1636         verts[0] = swizzleLane0(b);
   1637         verts[1] = swizzleLane1(b);
   1638         verts[2] = swizzleLane2(b);
   1639         break;
   1640     case 5:
   1641         // triangle 1 - 0 2 3
   1642         verts[0] = swizzleLane0(b);
   1643         verts[1] = swizzleLane2(b);
   1644         verts[2] = swizzleLane3(b);
   1645         break;
   1646     case 6:
   1647         // triangle 2 - 4 5 6
   1648         verts[0] = swizzleLane4(b);
   1649         verts[1] = swizzleLane5(b);
   1650         verts[2] = swizzleLane6(b);
   1651         break;
   1652     case 7:
   1653         // triangle 3 - 4 6 7
   1654         verts[0] = swizzleLane4(b);
   1655         verts[1] = swizzleLane6(b);
   1656         verts[2] = swizzleLane7(b);
   1657         break;
   1658     }
   1659 #endif
   1660 }
   1661 
   1662 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1663 {
   1664     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
   1665     return false;
   1666 }
   1667 
   1668 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1669 {
   1670     PaLineStrip1(pa, slot, verts);
   1671 
   1672     if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
   1673     {
   1674         // loop reconnect now
   1675         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
   1676 
   1677 #if USE_SIMD16_FRONTEND
   1678         simdvector first;
   1679 
   1680         const simd16vector &first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
   1681 
   1682         if (!pa.useAlternateOffset)
   1683         {
   1684             for (uint32_t i = 0; i < 4; i += 1)
   1685             {
   1686                 first[i] = _simd16_extract_ps(first_16[i], 0);
   1687             }
   1688         }
   1689         else
   1690         {
   1691             for (uint32_t i = 0; i < 4; i += 1)
   1692             {
   1693                 first[i] = _simd16_extract_ps(first_16[i], 1);
   1694             }
   1695         }
   1696 
   1697 #else
   1698         simdvector &first = PaGetSimdVector(pa, pa.first, slot);
   1699 
   1700 #endif
   1701         for (int i = 0; i < 4; i++)
   1702         {
   1703             float *firstVtx = (float *)&(first[i]);
   1704             float *targetVtx = (float *)&(verts[1][i]);
   1705             targetVtx[lane] = firstVtx[0];
   1706         }
   1707     }
   1708 
   1709     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1710     return true;
   1711 }
   1712 
   1713 #if ENABLE_AVX512_SIMD16
   1714 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1715 {
   1716     SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
   1717     return false;
   1718 }
   1719 
   1720 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1721 {
   1722     PaLineStrip1_simd16(pa, slot, verts);
   1723 
   1724     if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1)
   1725     {
   1726         // loop reconnect now
   1727         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
   1728 
   1729         const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
   1730 
   1731         for (int i = 0; i < 4; i++)
   1732         {
   1733             float *firstVtx = (float *)&(first[i]);
   1734             float *targetVtx = (float *)&(verts[1][i]);
   1735             targetVtx[lane] = firstVtx[0];
   1736         }
   1737     }
   1738 
   1739     SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   1740     return true;
   1741 }
   1742 
   1743 #endif
   1744 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1745 {
   1746     PaLineStripSingle0(pa, slot, primIndex, verts);
   1747 
   1748     if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
   1749     {
   1750 #if USE_SIMD16_FRONTEND
   1751         const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
   1752 
   1753         verts[1] = swizzleLane0(first);
   1754 #else
   1755         const simdvector &first = PaGetSimdVector(pa, pa.first, slot);
   1756 
   1757         verts[1] = swizzleLane0(first);
   1758 #endif
   1759     }
   1760 }
   1761 
   1762 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1763 {
   1764     SetNextPaState(pa, PaLineList1, PaLineListSingle0);
   1765     return false;    // Not enough vertices to assemble 8 lines
   1766 }
   1767 
   1768 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1769 {
   1770 #if USE_SIMD16_FRONTEND
   1771     simdvector a;
   1772     simdvector b;
   1773 
   1774     if (!pa.useAlternateOffset)
   1775     {
   1776         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
   1777 
   1778         for (uint32_t i = 0; i < 4; i += 1)
   1779         {
   1780             a[i] = _simd16_extract_ps(a_16[i], 0);
   1781             b[i] = _simd16_extract_ps(a_16[i], 1);
   1782         }
   1783     }
   1784     else
   1785     {
   1786         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
   1787 
   1788         for (uint32_t i = 0; i < 4; i += 1)
   1789         {
   1790             a[i] = _simd16_extract_ps(b_16[i], 0);
   1791             b[i] = _simd16_extract_ps(b_16[i], 1);
   1792         }
   1793     }
   1794 
   1795 #else
   1796     simdvector &a = PaGetSimdVector(pa, 0, slot);
   1797     simdvector &b = PaGetSimdVector(pa, 1, slot);
   1798 
   1799 #endif
   1800     /// @todo: verify provoking vertex is correct
   1801     // Line list 0  1  2  3  4  5  6  7
   1802     //           8  9 10 11 12 13 14 15
   1803 
   1804     // shuffle:
   1805     //           0 2 4 6 8 10 12 14
   1806     //           1 3 5 7 9 11 13 15
   1807 
   1808     for (uint32_t i = 0; i < 4; ++i)
   1809     {
   1810         // 0 1 2 3 8 9 10 11
   1811         __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
   1812         // 4 5 6 7 12 13 14 15
   1813         __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
   1814 
   1815         // 0 2 4 6 8 10 12 14
   1816         verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
   1817         // 1 3 5 7 9 11 13 15
   1818         verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
   1819     }
   1820 
   1821     SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   1822     return true;
   1823 }
   1824 
   1825 #if ENABLE_AVX512_SIMD16
   1826 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1827 {
   1828     SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
   1829     return false;    // Not enough vertices to assemble 16 lines
   1830 }
   1831 
   1832 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   1833 {
   1834     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
   1835     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
   1836 
   1837     simd16vector &v0 = verts[0];
   1838     simd16vector &v1 = verts[1];
   1839 
   1840     // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
   1841     // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF
   1842 
   1843     // for simd16 x, y, z, and w
   1844     for (int i = 0; i < 4; i += 1)
   1845     {
   1846         simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) 10 00 10 00    // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB
   1847         simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) 11 01 11 01    // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
   1848 
   1849         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                          // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
   1850         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                          // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
   1851     }
   1852 
   1853     SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   1854     return true;
   1855 }
   1856 
   1857 #endif
   1858 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   1859 {
   1860 #if USE_SIMD16_FRONTEND
   1861     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
   1862     const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
   1863 
   1864     if (pa.useAlternateOffset)
   1865     {
   1866         primIndex += KNOB_SIMD_WIDTH;
   1867     }
   1868 
   1869     switch (primIndex)
   1870     {
   1871     case 0:
   1872         verts[0] = swizzleLane0(a);
   1873         verts[1] = swizzleLane1(a);
   1874         break;
   1875     case 1:
   1876         verts[0] = swizzleLane2(a);
   1877         verts[1] = swizzleLane3(a);
   1878         break;
   1879     case 2:
   1880         verts[0] = swizzleLane4(a);
   1881         verts[1] = swizzleLane5(a);
   1882         break;
   1883     case 3:
   1884         verts[0] = swizzleLane6(a);
   1885         verts[1] = swizzleLane7(a);
   1886         break;
   1887     case 4:
   1888         verts[0] = swizzleLane8(a);
   1889         verts[1] = swizzleLane9(a);
   1890         break;
   1891     case 5:
   1892         verts[0] = swizzleLaneA(a);
   1893         verts[1] = swizzleLaneB(a);
   1894         break;
   1895     case 6:
   1896         verts[0] = swizzleLaneC(a);
   1897         verts[1] = swizzleLaneD(a);
   1898         break;
   1899     case 7:
   1900         verts[0] = swizzleLaneE(a);
   1901         verts[1] = swizzleLaneF(a);
   1902         break;
   1903     case 8:
   1904         verts[0] = swizzleLane0(b);
   1905         verts[1] = swizzleLane1(b);
   1906         break;
   1907     case 9:
   1908         verts[0] = swizzleLane2(b);
   1909         verts[1] = swizzleLane3(b);
   1910         break;
   1911     case 10:
   1912         verts[0] = swizzleLane4(b);
   1913         verts[1] = swizzleLane5(b);
   1914         break;
   1915     case 11:
   1916         verts[0] = swizzleLane6(b);
   1917         verts[1] = swizzleLane7(b);
   1918         break;
   1919     case 12:
   1920         verts[0] = swizzleLane8(b);
   1921         verts[1] = swizzleLane9(b);
   1922         break;
   1923     case 13:
   1924         verts[0] = swizzleLaneA(b);
   1925         verts[1] = swizzleLaneB(b);
   1926         break;
   1927     case 14:
   1928         verts[0] = swizzleLaneC(b);
   1929         verts[1] = swizzleLaneD(b);
   1930         break;
   1931     case 15:
   1932         verts[0] = swizzleLaneE(b);
   1933         verts[1] = swizzleLaneF(b);
   1934         break;
   1935     }
   1936 #else
   1937     const simdvector &a = PaGetSimdVector(pa, 0, slot);
   1938     const simdvector &b = PaGetSimdVector(pa, 1, slot);
   1939 
   1940     switch (primIndex)
   1941     {
   1942     case 0:
   1943         verts[0] = swizzleLane0(a);
   1944         verts[1] = swizzleLane1(a);
   1945         break;
   1946     case 1:
   1947         verts[0] = swizzleLane2(a);
   1948         verts[1] = swizzleLane3(a);
   1949         break;
   1950     case 2:
   1951         verts[0] = swizzleLane4(a);
   1952         verts[1] = swizzleLane5(a);
   1953         break;
   1954     case 3:
   1955         verts[0] = swizzleLane6(a);
   1956         verts[1] = swizzleLane7(a);
   1957         break;
   1958     case 4:
   1959         verts[0] = swizzleLane0(b);
   1960         verts[1] = swizzleLane1(b);
   1961         break;
   1962     case 5:
   1963         verts[0] = swizzleLane2(b);
   1964         verts[1] = swizzleLane3(b);
   1965         break;
   1966     case 6:
   1967         verts[0] = swizzleLane4(b);
   1968         verts[1] = swizzleLane5(b);
   1969         break;
   1970     case 7:
   1971         verts[0] = swizzleLane6(b);
   1972         verts[1] = swizzleLane7(b);
   1973         break;
   1974     }
   1975 #endif
   1976 }
   1977 
   1978 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1979 {
   1980     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
   1981     return false;    // Not enough vertices to assemble 8 lines
   1982 }
   1983 
   1984 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   1985 {
   1986 #if USE_SIMD16_FRONTEND
   1987     simdvector a;
   1988     simdvector b;
   1989 
   1990     if (!pa.useAlternateOffset)
   1991     {
   1992         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
   1993 
   1994         for (uint32_t i = 0; i < 4; i += 1)
   1995         {
   1996             a[i] = _simd16_extract_ps(a_16[i], 0);
   1997             b[i] = _simd16_extract_ps(a_16[i], 1);
   1998         }
   1999     }
   2000     else
   2001     {
   2002         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
   2003 
   2004         for (uint32_t i = 0; i < 4; i += 1)
   2005         {
   2006             a[i] = _simd16_extract_ps(b_16[i], 0);
   2007             b[i] = _simd16_extract_ps(b_16[i], 1);
   2008         }
   2009     }
   2010 
   2011 #else
   2012     simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
   2013     simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
   2014 
   2015 #endif
   2016     /// @todo: verify provoking vertex is correct
   2017     // Line list 0  1  2  3  4  5  6  7
   2018     //           8  9 10 11 12 13 14 15
   2019 
   2020     // shuffle:
   2021     //           0  1  2  3  4  5  6  7
   2022     //           1  2  3  4  5  6  7  8
   2023 
   2024     verts[0] = a;
   2025 
   2026     for(uint32_t i = 0; i < 4; ++i)
   2027     {
   2028         // 1 2 3 x 5 6 7 x
   2029         __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
   2030         // 4 5 6 7 8 9 10 11
   2031         __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
   2032 
   2033         // x x x 4 x x x 8
   2034         __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low  (0 0 0 0)
   2035 
   2036         verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
   2037     }
   2038 
   2039     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   2040     return true;
   2041 }
   2042 
   2043 #if ENABLE_AVX512_SIMD16
   2044 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   2045 {
   2046     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
   2047     return false;    // Not enough vertices to assemble 16 lines
   2048 }
   2049 
   2050 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   2051 {
   2052     const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
   2053 
   2054     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
   2055     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
   2056 
   2057     simd16vector &v0 = verts[0];
   2058     simd16vector &v1 = verts[1];
   2059 
   2060     // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   2061     // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
   2062 
   2063     v0 = a;                                                                                         // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   2064 
   2065     // for simd16 x, y, z, and w
   2066     for (int i = 0; i < 4; i += 1)
   2067     {
   2068         simd16scalar temp = _simd16_blend_ps(a[i], b[i], 0x0001);                                   // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
   2069 
   2070         v1[i] = _simd16_permute_ps(temp, perm);                                                     // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
   2071     }
   2072 
   2073     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
   2074     return true;
   2075 }
   2076 
   2077 #endif
   2078 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   2079 {
   2080 #if USE_SIMD16_FRONTEND
   2081     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
   2082     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
   2083 
   2084     if (pa.useAlternateOffset)
   2085     {
   2086         primIndex += KNOB_SIMD_WIDTH;
   2087     }
   2088 
   2089     switch (primIndex)
   2090     {
   2091     case 0:
   2092         verts[0] = swizzleLane0(a);
   2093         verts[1] = swizzleLane1(a);
   2094         break;
   2095     case 1:
   2096         verts[0] = swizzleLane1(a);
   2097         verts[1] = swizzleLane2(a);
   2098         break;
   2099     case 2:
   2100         verts[0] = swizzleLane2(a);
   2101         verts[1] = swizzleLane3(a);
   2102         break;
   2103     case 3:
   2104         verts[0] = swizzleLane3(a);
   2105         verts[1] = swizzleLane4(a);
   2106         break;
   2107     case 4:
   2108         verts[0] = swizzleLane4(a);
   2109         verts[1] = swizzleLane5(a);
   2110         break;
   2111     case 5:
   2112         verts[0] = swizzleLane5(a);
   2113         verts[1] = swizzleLane6(a);
   2114         break;
   2115     case 6:
   2116         verts[0] = swizzleLane6(a);
   2117         verts[1] = swizzleLane7(a);
   2118         break;
   2119     case 7:
   2120         verts[0] = swizzleLane7(a);
   2121         verts[1] = swizzleLane8(a);
   2122         break;
   2123     case 8:
   2124         verts[0] = swizzleLane8(a);
   2125         verts[1] = swizzleLane9(a);
   2126         break;
   2127     case 9:
   2128         verts[0] = swizzleLane9(a);
   2129         verts[1] = swizzleLaneA(a);
   2130         break;
   2131     case 10:
   2132         verts[0] = swizzleLaneA(a);
   2133         verts[1] = swizzleLaneB(a);
   2134         break;
   2135     case 11:
   2136         verts[0] = swizzleLaneB(a);
   2137         verts[1] = swizzleLaneC(a);
   2138         break;
   2139     case 12:
   2140         verts[0] = swizzleLaneC(a);
   2141         verts[1] = swizzleLaneD(a);
   2142         break;
   2143     case 13:
   2144         verts[0] = swizzleLaneD(a);
   2145         verts[1] = swizzleLaneE(a);
   2146         break;
   2147     case 14:
   2148         verts[0] = swizzleLaneE(a);
   2149         verts[1] = swizzleLaneF(a);
   2150         break;
   2151     case 15:
   2152         verts[0] = swizzleLaneF(a);
   2153         verts[1] = swizzleLane0(b);
   2154         break;
   2155     }
   2156 #else
   2157     const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
   2158     const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
   2159 
   2160     switch (primIndex)
   2161     {
   2162     case 0:
   2163         verts[0] = swizzleLane0(a);
   2164         verts[1] = swizzleLane1(a);
   2165         break;
   2166     case 1:
   2167         verts[0] = swizzleLane1(a);
   2168         verts[1] = swizzleLane2(a);
   2169         break;
   2170     case 2:
   2171         verts[0] = swizzleLane2(a);
   2172         verts[1] = swizzleLane3(a);
   2173         break;
   2174     case 3:
   2175         verts[0] = swizzleLane3(a);
   2176         verts[1] = swizzleLane4(a);
   2177         break;
   2178     case 4:
   2179         verts[0] = swizzleLane4(a);
   2180         verts[1] = swizzleLane5(a);
   2181         break;
   2182     case 5:
   2183         verts[0] = swizzleLane5(a);
   2184         verts[1] = swizzleLane6(a);
   2185         break;
   2186     case 6:
   2187         verts[0] = swizzleLane6(a);
   2188         verts[1] = swizzleLane7(a);
   2189         break;
   2190     case 7:
   2191         verts[0] = swizzleLane7(a);
   2192         verts[1] = swizzleLane0(b);
   2193         break;
   2194     }
   2195 #endif
   2196 }
   2197 
   2198 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   2199 {
   2200 #if USE_SIMD16_FRONTEND
   2201     simdvector a;
   2202 
   2203     const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
   2204 
   2205     if (!pa.useAlternateOffset)
   2206     {
   2207         for (uint32_t i = 0; i < 4; i += 1)
   2208         {
   2209             a[i] = _simd16_extract_ps(a_16[i], 0);
   2210         }
   2211     }
   2212     else
   2213     {
   2214         for (uint32_t i = 0; i < 4; i += 1)
   2215         {
   2216             a[i] = _simd16_extract_ps(a_16[i], 1);
   2217         }
   2218     }
   2219 
   2220 #else
   2221     simdvector &a = PaGetSimdVector(pa, 0, slot);
   2222 
   2223 #endif
   2224     verts[0] = a;  // points only have 1 vertex.
   2225 
   2226     SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2227     return true;
   2228 }
   2229 
   2230 #if ENABLE_AVX512_SIMD16
   2231 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   2232 {
   2233     simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot);
   2234 
   2235     verts[0] = a;  // points only have 1 vertex.
   2236 
   2237     SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2238     return true;
   2239 }
   2240 
   2241 #endif
   2242 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
   2243 {
   2244 #if USE_SIMD16_FRONTEND
   2245     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
   2246 
   2247     if (pa.useAlternateOffset)
   2248     {
   2249         primIndex += KNOB_SIMD_WIDTH;
   2250     }
   2251 
   2252     verts[0] = swizzleLaneN(a, primIndex);
   2253 #else
   2254     const simdvector &a = PaGetSimdVector(pa, 0, slot);
   2255 
   2256     verts[0] = swizzleLaneN(a, primIndex);
   2257 #endif
   2258 }
   2259 
   2260 //////////////////////////////////////////////////////////////////////////
   2261 /// @brief State 1 for RECT_LIST topology.
   2262 ///        There is not enough to assemble 8 triangles.
   2263 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
   2264 {
   2265     SetNextPaState(pa, PaRectList1, PaRectListSingle0);
   2266     return false;
   2267 }
   2268 
   2269 //////////////////////////////////////////////////////////////////////////
   2270 /// @brief State 1 for RECT_LIST topology.
   2271 ///   Rect lists has the following format.
   2272 ///             w          x          y           z
   2273 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
   2274 ///         | \ |      | \ |      | \ |       | \ |
   2275 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
   2276 ///            v0         v3         v6          v9
   2277 ///
   2278 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
   2279 ///
   2280 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
   2281 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
   2282 ///   etc.
   2283 ///
   2284 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
   2285 ///   where v0 contains all the first vertices for 8 triangles.
   2286 ///
   2287 ///     Result:
   2288 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
   2289 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
   2290 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
   2291 ///
   2292 /// @param pa - State for PA state machine.
   2293 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
   2294 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
   2295 bool PaRectList1(
   2296     PA_STATE_OPT& pa,
   2297     uint32_t slot,
   2298     simdvector verts[])
   2299 {
   2300     // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
   2301 #if USE_SIMD16_FRONTEND
   2302     simdvector a;
   2303     simdvector b;
   2304 
   2305     if (!pa.useAlternateOffset)
   2306     {
   2307         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
   2308 
   2309         for (uint32_t i = 0; i < 4; i += 1)
   2310         {
   2311             a[i] = _simd16_extract_ps(a_16[i], 0);
   2312             b[i] = _simd16_extract_ps(a_16[i], 1);
   2313         }
   2314     }
   2315     else
   2316     {
   2317         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
   2318 
   2319         for (uint32_t i = 0; i < 4; i += 1)
   2320         {
   2321             a[i] = _simd16_extract_ps(b_16[i], 0);
   2322             b[i] = _simd16_extract_ps(b_16[i], 1);;
   2323         }
   2324     }
   2325 
   2326 #else
   2327     simdvector &a = PaGetSimdVector(pa, 0, slot);           // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
   2328     simdvector &b = PaGetSimdVector(pa, 1, slot);           // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
   2329 
   2330 #endif
   2331     __m256 tmp0, tmp1, tmp2;
   2332 
   2333     // Loop over each component in the simdvector.
   2334     for(int i = 0; i < 4; ++i)
   2335     {
   2336         simdvector& v0 = verts[0];                          // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
   2337         tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
   2338         v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
   2339         tmp1  = _mm256_permute_ps(v0[i], 0xF0);             // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
   2340         v0[i] = _mm256_permute_ps(v0[i], 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
   2341         v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
   2342 
   2343         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
   2344         ///      AVX2 should make this much cheaper.
   2345         simdvector& v1 = verts[1];                          // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
   2346         v1[i] = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
   2347         tmp1  = _mm256_permute_ps(a[i], 0x43);              // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
   2348         tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);         // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
   2349         tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);    // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
   2350         v1[i] = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
   2351         v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
   2352         v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
   2353 
   2354         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
   2355         simdvector& v2 = verts[2];                          // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
   2356         v2[i] = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
   2357         tmp1  = _mm256_permute_ps(tmp2, 0x31);              // tmp1 = { v2, *, v5, *, *, *, *, * }
   2358         v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
   2359 
   2360         // Need to compute 4th implied vertex for the rectangle.
   2361         tmp2  = _mm256_sub_ps(v0[i], v1[i]);
   2362         tmp2  = _mm256_add_ps(tmp2, v2[i]);                 // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
   2363         tmp2  = _mm256_permute_ps(tmp2, 0xA0);              // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
   2364         v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
   2365     }
   2366 
   2367     SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2368     return true;
   2369 }
   2370 
   2371 //////////////////////////////////////////////////////////////////////////
   2372 /// @brief State 2 for RECT_LIST topology.
   2373 ///        Not implemented unless there is a use case for more then 8 rects.
   2374 /// @param pa - State for PA state machine.
   2375 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
   2376 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
   2377 bool PaRectList2(
   2378     PA_STATE_OPT& pa,
   2379     uint32_t slot,
   2380     simdvector verts[])
   2381 {
   2382     SWR_INVALID("Is rect list used for anything other then clears?");
   2383     SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2384     return true;
   2385 }
   2386 
   2387 #if ENABLE_AVX512_SIMD16
   2388 //////////////////////////////////////////////////////////////////////////
   2389 /// @brief State 1 for RECT_LIST topology.
   2390 ///        There is not enough to assemble 8 triangles.
   2391 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
   2392 {
   2393     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
   2394     return false;
   2395 }
   2396 
   2397 //////////////////////////////////////////////////////////////////////////
   2398 /// @brief State 1 for RECT_LIST topology.
   2399 ///   Rect lists has the following format.
   2400 ///             w          x          y           z
   2401 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
   2402 ///         | \ |      | \ |      | \ |       | \ |
   2403 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
   2404 ///            v0         v3         v6          v9
   2405 ///
   2406 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
   2407 ///
   2408 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
   2409 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
   2410 ///   etc.
   2411 ///
   2412 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
   2413 ///   where v0 contains all the first vertices for 8 triangles.
   2414 ///
   2415 ///     Result:
   2416 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
   2417 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
   2418 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
   2419 ///
   2420 /// @param pa - State for PA state machine.
   2421 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
   2422 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
   2423 bool PaRectList1_simd16(
   2424     PA_STATE_OPT& pa,
   2425     uint32_t slot,
   2426     simd16vector verts[])
   2427 {
   2428     simdvector a;
   2429     simdvector b;
   2430 
   2431     if (!pa.useAlternateOffset)
   2432     {
   2433         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7, v8, v9, v10, v11, v12, v13, v14, v15 }
   2434 
   2435         for (uint32_t i = 0; i < 4; i += 1)
   2436         {
   2437             a[i] = _simd16_extract_ps(a_16[i], 0);
   2438             b[i] = _simd16_extract_ps(a_16[i], 1);
   2439         }
   2440     }
   2441     else
   2442     {
   2443         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
   2444 
   2445         for (uint32_t i = 0; i < 4; i += 1)
   2446         {
   2447             a[i] = _simd16_extract_ps(b_16[i], 0);
   2448             b[i] = _simd16_extract_ps(b_16[i], 1);
   2449         }
   2450     }
   2451 
   2452     simd16vector &v0 = verts[0];                            // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
   2453     simd16vector &v1 = verts[1];                            // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
   2454     simd16vector &v2 = verts[2];                            // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
   2455 
   2456     // Loop over each component in the simdvector.
   2457     for (int i = 0; i < 4; i += 1)
   2458     {
   2459         simdscalar v0_lo;                                   // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
   2460         simdscalar v1_lo;                                   // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
   2461         simdscalar v2_lo;                                   // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
   2462 
   2463         __m256 tmp0, tmp1, tmp2;
   2464 
   2465         tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
   2466         v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
   2467         tmp1 = _mm256_permute_ps(v0_lo, 0xF0);              // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
   2468         v0_lo = _mm256_permute_ps(v0_lo, 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
   2469         v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
   2470 
   2471         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
   2472         ///      AVX2 should make this much cheaper.
   2473         v1_lo = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
   2474         tmp1 = _mm256_permute_ps(a[i], 0x43);               // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
   2475         tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0);          // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
   2476         tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);     // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
   2477         v1_lo = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
   2478         v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
   2479         v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
   2480 
   2481         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
   2482         v2_lo = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
   2483         tmp1 = _mm256_permute_ps(tmp2, 0x31);               // tmp1 = { v2, *, v5, *, *, *, *, * }
   2484         v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
   2485 
   2486         // Need to compute 4th implied vertex for the rectangle.
   2487         tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
   2488         tmp2 = _mm256_add_ps(tmp2, v2_lo);                  // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
   2489         tmp2 = _mm256_permute_ps(tmp2, 0xA0);               // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
   2490         v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
   2491 
   2492         v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
   2493         v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
   2494         v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
   2495     }
   2496 
   2497     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2498     return true;
   2499 }
   2500 
   2501 //////////////////////////////////////////////////////////////////////////
   2502 /// @brief State 2 for RECT_LIST topology.
   2503 ///        Not implemented unless there is a use case for more then 8 rects.
   2504 /// @param pa - State for PA state machine.
   2505 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
   2506 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
   2507 bool PaRectList2_simd16(
   2508     PA_STATE_OPT& pa,
   2509     uint32_t slot,
   2510     simd16vector verts[])
   2511 {
   2512     SWR_INVALID("Is rect list used for anything other then clears?");
   2513     SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
   2514     return true;
   2515 }
   2516 
   2517 #endif
   2518 //////////////////////////////////////////////////////////////////////////
   2519 /// @brief This procedure is called by the Binner to assemble the attributes.
   2520 ///        Unlike position, which is stored vertically, the attributes are
   2521 ///        stored horizontally. The outputs from the VS, labeled as 'a' and
   2522 ///        'b' are vertical. This function needs to transpose the lanes
   2523 ///        containing the vertical attribute data into horizontal form.
   2524 /// @param pa - State for PA state machine.
   2525 /// @param slot - Index into VS output for a given attribute.
   2526 /// @param primIndex - Binner processes each triangle individually.
   2527 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
   2528 void PaRectListSingle0(
   2529     PA_STATE_OPT& pa,
   2530     uint32_t slot,
   2531     uint32_t primIndex,
   2532     simd4scalar verts[])
   2533 {
   2534     // We have 12 simdscalars contained within 3 simdvectors which
   2535     // hold at least 8 triangles worth of data. We want to assemble a single
   2536     // triangle with data in horizontal form.
   2537 #if USE_SIMD16_FRONTEND
   2538     simdvector a;
   2539     simdvector b;
   2540 
   2541     if (!pa.useAlternateOffset)
   2542     {
   2543         const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
   2544 
   2545         for (uint32_t i = 0; i < 4; i += 1)
   2546         {
   2547             a[i] = _simd16_extract_ps(a_16[i], 0);
   2548             b[i] = _simd16_extract_ps(a_16[i], 1);
   2549         }
   2550     }
   2551     else
   2552     {
   2553         const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
   2554 
   2555         for (uint32_t i = 0; i < 4; i += 1)
   2556         {
   2557             a[i] = _simd16_extract_ps(b_16[i], 0);
   2558             b[i] = _simd16_extract_ps(b_16[i], 1);;
   2559         }
   2560     }
   2561 
   2562 #else
   2563     simdvector& a = PaGetSimdVector(pa, 0, slot);
   2564 
   2565 #endif
   2566     // Convert from vertical to horizontal.
   2567     switch(primIndex)
   2568     {
   2569     case 0:
   2570         verts[0] = swizzleLane0(a);
   2571         verts[1] = swizzleLane1(a);
   2572         verts[2] = swizzleLane2(a);
   2573         break;
   2574     case 1:
   2575         verts[0] = swizzleLane0(a);
   2576         verts[1] = swizzleLane2(a);
   2577         verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA);
   2578         break;
   2579     case 2:
   2580     case 3:
   2581     case 4:
   2582     case 5:
   2583     case 6:
   2584     case 7:
   2585         SWR_INVALID("Invalid primIndex: %d", primIndex);
   2586         break;
   2587     };
   2588 }
   2589 
   2590 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
   2591     uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) :
   2592     PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
   2593     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
   2594 {
   2595     const API_STATE& state = GetApiState(pDC);
   2596 
   2597     this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
   2598 
   2599 #if ENABLE_AVX512_SIMD16
   2600     pfnPaFunc_simd16 = nullptr;
   2601 
   2602 #endif
   2603     switch (this->binTopology)
   2604     {
   2605         case TOP_TRIANGLE_LIST:
   2606             this->pfnPaFunc = PaTriList0;
   2607 #if ENABLE_AVX512_SIMD16
   2608             this->pfnPaFunc_simd16 = PaTriList0_simd16;
   2609 #endif
   2610             break;
   2611         case TOP_TRIANGLE_STRIP:
   2612             this->pfnPaFunc = PaTriStrip0;
   2613 #if ENABLE_AVX512_SIMD16
   2614             this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
   2615 #endif
   2616             break;
   2617         case TOP_TRIANGLE_FAN:
   2618             this->pfnPaFunc = PaTriFan0;
   2619 #if ENABLE_AVX512_SIMD16
   2620             this->pfnPaFunc_simd16 = PaTriFan0_simd16;
   2621 #endif
   2622             break;
   2623         case TOP_QUAD_LIST:
   2624             this->pfnPaFunc = PaQuadList0;
   2625 #if ENABLE_AVX512_SIMD16
   2626             this->pfnPaFunc_simd16 = PaQuadList0_simd16;
   2627 #endif
   2628             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
   2629             break;
   2630         case TOP_QUAD_STRIP:
   2631             // quad strip pattern when decomposed into triangles is the same as verts strips
   2632             this->pfnPaFunc = PaTriStrip0;
   2633 #if ENABLE_AVX512_SIMD16
   2634             this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
   2635 #endif
   2636             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
   2637             break;
   2638         case TOP_LINE_LIST:
   2639             this->pfnPaFunc = PaLineList0;
   2640 #if ENABLE_AVX512_SIMD16
   2641             this->pfnPaFunc_simd16 = PaLineList0_simd16;
   2642 #endif
   2643             this->numPrims = in_numPrims;
   2644             break;
   2645         case TOP_LINE_STRIP:
   2646             this->pfnPaFunc = PaLineStrip0;
   2647 #if ENABLE_AVX512_SIMD16
   2648             this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
   2649 #endif
   2650             this->numPrims = in_numPrims;
   2651             break;
   2652         case TOP_LINE_LOOP:
   2653             this->pfnPaFunc = PaLineLoop0;
   2654 #if ENABLE_AVX512_SIMD16
   2655             this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
   2656 #endif
   2657             this->numPrims = in_numPrims;
   2658             break;
   2659         case TOP_POINT_LIST:
   2660             this->pfnPaFunc = PaPoints0;
   2661 #if ENABLE_AVX512_SIMD16
   2662             this->pfnPaFunc_simd16 = PaPoints0_simd16;
   2663 #endif
   2664             this->numPrims = in_numPrims;
   2665             break;
   2666         case TOP_RECT_LIST:
   2667             this->pfnPaFunc = PaRectList0;
   2668 #if ENABLE_AVX512_SIMD16
   2669             this->pfnPaFunc_simd16 = PaRectList0_simd16;
   2670 #endif
   2671             this->numPrims = in_numPrims * 2;
   2672             break;
   2673 
   2674         case TOP_PATCHLIST_1:
   2675             this->pfnPaFunc = PaPatchList<1>;
   2676 #if ENABLE_AVX512_SIMD16
   2677             this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
   2678 #endif
   2679             break;
   2680         case TOP_PATCHLIST_2:
   2681             this->pfnPaFunc = PaPatchList<2>;
   2682 #if ENABLE_AVX512_SIMD16
   2683             this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
   2684 #endif
   2685             break;
   2686         case TOP_PATCHLIST_3:
   2687             this->pfnPaFunc = PaPatchList<3>;
   2688 #if ENABLE_AVX512_SIMD16
   2689             this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
   2690 #endif
   2691             break;
   2692         case TOP_PATCHLIST_4:
   2693             this->pfnPaFunc = PaPatchList<4>;
   2694 #if ENABLE_AVX512_SIMD16
   2695             this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
   2696 #endif
   2697             break;
   2698         case TOP_PATCHLIST_5:
   2699             this->pfnPaFunc = PaPatchList<5>;
   2700 #if ENABLE_AVX512_SIMD16
   2701             this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
   2702 #endif
   2703             break;
   2704         case TOP_PATCHLIST_6:
   2705             this->pfnPaFunc = PaPatchList<6>;
   2706 #if ENABLE_AVX512_SIMD16
   2707             this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
   2708 #endif
   2709             break;
   2710         case TOP_PATCHLIST_7:
   2711             this->pfnPaFunc = PaPatchList<7>;
   2712 #if ENABLE_AVX512_SIMD16
   2713             this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
   2714 #endif
   2715             break;
   2716         case TOP_PATCHLIST_8:
   2717             this->pfnPaFunc = PaPatchList<8>;
   2718 #if ENABLE_AVX512_SIMD16
   2719             this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
   2720 #endif
   2721             break;
   2722         case TOP_PATCHLIST_9:
   2723             this->pfnPaFunc = PaPatchList<9>;
   2724 #if ENABLE_AVX512_SIMD16
   2725             this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
   2726 #endif
   2727             break;
   2728         case TOP_PATCHLIST_10:
   2729             this->pfnPaFunc = PaPatchList<10>;
   2730 #if ENABLE_AVX512_SIMD16
   2731             this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
   2732 #endif
   2733             break;
   2734         case TOP_PATCHLIST_11:
   2735             this->pfnPaFunc = PaPatchList<11>;
   2736 #if ENABLE_AVX512_SIMD16
   2737             this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
   2738 #endif
   2739             break;
   2740         case TOP_PATCHLIST_12:
   2741             this->pfnPaFunc = PaPatchList<12>;
   2742 #if ENABLE_AVX512_SIMD16
   2743             this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
   2744 #endif
   2745             break;
   2746         case TOP_PATCHLIST_13:
   2747             this->pfnPaFunc = PaPatchList<13>;
   2748 #if ENABLE_AVX512_SIMD16
   2749             this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
   2750 #endif
   2751             break;
   2752         case TOP_PATCHLIST_14:
   2753             this->pfnPaFunc = PaPatchList<14>;
   2754 #if ENABLE_AVX512_SIMD16
   2755             this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
   2756 #endif
   2757             break;
   2758         case TOP_PATCHLIST_15:
   2759             this->pfnPaFunc = PaPatchList<15>;
   2760 #if ENABLE_AVX512_SIMD16
   2761             this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
   2762 #endif
   2763             break;
   2764         case TOP_PATCHLIST_16:
   2765             this->pfnPaFunc = PaPatchList<16>;
   2766 #if ENABLE_AVX512_SIMD16
   2767             this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
   2768 #endif
   2769             break;
   2770         case TOP_PATCHLIST_17:
   2771             this->pfnPaFunc = PaPatchList<17>;
   2772 #if ENABLE_AVX512_SIMD16
   2773             this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
   2774 #endif
   2775             break;
   2776         case TOP_PATCHLIST_18:
   2777             this->pfnPaFunc = PaPatchList<18>;
   2778 #if ENABLE_AVX512_SIMD16
   2779             this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
   2780 #endif
   2781             break;
   2782         case TOP_PATCHLIST_19:
   2783             this->pfnPaFunc = PaPatchList<19>;
   2784 #if ENABLE_AVX512_SIMD16
   2785             this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
   2786 #endif
   2787             break;
   2788         case TOP_PATCHLIST_20:
   2789             this->pfnPaFunc = PaPatchList<20>;
   2790 #if ENABLE_AVX512_SIMD16
   2791             this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
   2792 #endif
   2793             break;
   2794         case TOP_PATCHLIST_21:
   2795             this->pfnPaFunc = PaPatchList<21>;
   2796 #if ENABLE_AVX512_SIMD16
   2797             this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
   2798 #endif
   2799             break;
   2800         case TOP_PATCHLIST_22:
   2801             this->pfnPaFunc = PaPatchList<22>;
   2802 #if ENABLE_AVX512_SIMD16
   2803             this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
   2804 #endif
   2805             break;
   2806         case TOP_PATCHLIST_23:
   2807             this->pfnPaFunc = PaPatchList<23>;
   2808 #if ENABLE_AVX512_SIMD16
   2809             this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
   2810 #endif
   2811             break;
   2812         case TOP_PATCHLIST_24:
   2813             this->pfnPaFunc = PaPatchList<24>;
   2814 #if ENABLE_AVX512_SIMD16
   2815             this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
   2816 #endif
   2817             break;
   2818         case TOP_PATCHLIST_25:
   2819             this->pfnPaFunc = PaPatchList<25>;
   2820 #if ENABLE_AVX512_SIMD16
   2821             this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
   2822 #endif
   2823             break;
   2824         case TOP_PATCHLIST_26:
   2825             this->pfnPaFunc = PaPatchList<26>;
   2826 #if ENABLE_AVX512_SIMD16
   2827             this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
   2828 #endif
   2829             break;
   2830         case TOP_PATCHLIST_27:
   2831             this->pfnPaFunc = PaPatchList<27>;
   2832 #if ENABLE_AVX512_SIMD16
   2833             this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
   2834 #endif
   2835             break;
   2836         case TOP_PATCHLIST_28:
   2837             this->pfnPaFunc = PaPatchList<28>;
   2838 #if ENABLE_AVX512_SIMD16
   2839             this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
   2840 #endif
   2841             break;
   2842         case TOP_PATCHLIST_29:
   2843             this->pfnPaFunc = PaPatchList<29>;
   2844 #if ENABLE_AVX512_SIMD16
   2845             this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
   2846 #endif
   2847             break;
   2848         case TOP_PATCHLIST_30:
   2849             this->pfnPaFunc = PaPatchList<30>;
   2850 #if ENABLE_AVX512_SIMD16
   2851             this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
   2852 #endif
   2853             break;
   2854         case TOP_PATCHLIST_31:
   2855             this->pfnPaFunc = PaPatchList<31>;
   2856 #if ENABLE_AVX512_SIMD16
   2857             this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
   2858 #endif
   2859             break;
   2860         case TOP_PATCHLIST_32:
   2861             this->pfnPaFunc = PaPatchList<32>;
   2862 #if ENABLE_AVX512_SIMD16
   2863             this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
   2864 #endif
   2865             break;
   2866 
   2867         default:
   2868             SWR_INVALID("Invalid topology: %d", this->binTopology);
   2869             break;
   2870     };
   2871 
   2872     this->pfnPaFuncReset = this->pfnPaFunc;
   2873 #if ENABLE_AVX512_SIMD16
   2874     this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
   2875 #endif
   2876 
   2877 #if USE_SIMD16_FRONTEND
   2878     simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   2879     simd16scalari id82 = _simd16_set_epi32( 7,  7,  6,  6,  5,  5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
   2880 
   2881 #else
   2882     simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   2883     simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
   2884 
   2885 #endif
   2886     switch(this->binTopology)
   2887     {
   2888         case TOP_TRIANGLE_LIST:
   2889         case TOP_TRIANGLE_STRIP:
   2890         case TOP_TRIANGLE_FAN:
   2891         case TOP_LINE_STRIP:
   2892         case TOP_LINE_LIST:
   2893         case TOP_LINE_LOOP:
   2894 #if USE_SIMD16_FRONTEND
   2895             this->primIDIncr = 16;
   2896             this->primID = id16;
   2897 #else
   2898             this->primIDIncr = 8;
   2899             this->primID = id8;
   2900 #endif
   2901             break;
   2902         case TOP_QUAD_LIST:
   2903         case TOP_QUAD_STRIP:
   2904         case TOP_RECT_LIST:
   2905 #if USE_SIMD16_FRONTEND
   2906             this->primIDIncr = 8;
   2907             this->primID = id82;
   2908 #else
   2909             this->primIDIncr = 4;
   2910             this->primID = id4;
   2911 #endif
   2912             break;
   2913         case TOP_POINT_LIST:
   2914 #if USE_SIMD16_FRONTEND
   2915             this->primIDIncr = 16;
   2916             this->primID = id16;
   2917 #else
   2918             this->primIDIncr = 8;
   2919             this->primID = id8;
   2920 #endif
   2921             break;
   2922         case TOP_PATCHLIST_1:
   2923         case TOP_PATCHLIST_2:
   2924         case TOP_PATCHLIST_3:
   2925         case TOP_PATCHLIST_4:
   2926         case TOP_PATCHLIST_5:
   2927         case TOP_PATCHLIST_6:
   2928         case TOP_PATCHLIST_7:
   2929         case TOP_PATCHLIST_8:
   2930         case TOP_PATCHLIST_9:
   2931         case TOP_PATCHLIST_10:
   2932         case TOP_PATCHLIST_11:
   2933         case TOP_PATCHLIST_12:
   2934         case TOP_PATCHLIST_13:
   2935         case TOP_PATCHLIST_14:
   2936         case TOP_PATCHLIST_15:
   2937         case TOP_PATCHLIST_16:
   2938         case TOP_PATCHLIST_17:
   2939         case TOP_PATCHLIST_18:
   2940         case TOP_PATCHLIST_19:
   2941         case TOP_PATCHLIST_20:
   2942         case TOP_PATCHLIST_21:
   2943         case TOP_PATCHLIST_22:
   2944         case TOP_PATCHLIST_23:
   2945         case TOP_PATCHLIST_24:
   2946         case TOP_PATCHLIST_25:
   2947         case TOP_PATCHLIST_26:
   2948         case TOP_PATCHLIST_27:
   2949         case TOP_PATCHLIST_28:
   2950         case TOP_PATCHLIST_29:
   2951         case TOP_PATCHLIST_30:
   2952         case TOP_PATCHLIST_31:
   2953         case TOP_PATCHLIST_32:
   2954             // Always run KNOB_SIMD_WIDTH number of patches at a time.
   2955 #if USE_SIMD16_FRONTEND
   2956             this->primIDIncr = 16;
   2957             this->primID = id16;
   2958 #else
   2959             this->primIDIncr = 8;
   2960             this->primID = id8;
   2961 #endif
   2962             break;
   2963 
   2964         default:
   2965             SWR_INVALID("Invalid topology: %d", this->binTopology);
   2966             break;
   2967     };
   2968 
   2969 }
   2970 #endif
   2971