/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"

/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

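/**
 * Evaluate one plane's edge function c + ix*dcdx + iy*dcdy over a 4x4 pixel
 * block and gather the sign bits into a 16-bit mask (a set bit means the
 * value at that pixel is negative).
 */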
static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}


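/**
 * Build two 4x4 sign-bit masks for one plane: *outmask accumulates the mask
 * for the value starting at c, *partmask the mask for the value offset by
 * cdiff.
 */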
static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

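/*
 * Forwarding implementations of the specialized block entrypoints: these
 * just call the generic N-plane rasterizers with all planes enabled.
 */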
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"


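/**
 * SSE2 implementation of build_masks(): the same two sign-bit masks,
 * computed with packed 32->16->8 saturating packs and movemask.
 */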
static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


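/**
 * SSE2 implementation of build_mask_linear().
 */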
static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

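/**
 * Offset the four given plane-value vectors by cdiff, pack them down to
 * bytes and return the resulting 16-bit sign mask.
 */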
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}


#define NR_PLANES 3

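/**
 * SSE2 rasterization of a 3-plane triangle over a single 16x16 block.
 * The block position is packed into arg.triangle.plane_mask (x in the low
 * byte, y in the next byte), relative to the task's tile.  4x4 sub-blocks
 * lying entirely outside any plane are rejected; the rest are shaded via
 * lp_rast_shade_quads_mask() with a per-pixel mask.
 */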
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* recalc eo - easier than trying to load as scalars / shuffle... */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Adjust dcdx (negate). */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

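/**
 * SSE2 rasterization of a 3-plane triangle over a single 4x4 block.
 * As above, the block position is packed into arg.triangle.plane_mask.
 */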
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Adjust dcdx (negate). */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES

#else

#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)

#include <altivec.h>
#include "util/u_pwr8.h"

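/**
 * POWER8 implementation of build_masks().
 */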
static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }


   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}

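/**
 * POWER8 implementation of build_mask_linear().
 */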
static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}

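/**
 * Load one plane's c, dcdx, dcdy and eo fields into a single vector.
 */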
static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3

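/**
 * POWER8 version of the single-16x16-block, 3-plane special case
 * (see the SSE2 variant above).
 */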
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx (negate). */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

#undef NR_PLANES

#else

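/* Plain C fallback: forward to the generic 3-plane rasterizer. */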
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif


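/*
 * Select the fastest available mask builders for the templated rasterizers
 * instantiated below.
 */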
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif

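/*
 * Instantiate the templated rasterizers from lp_rast_tri_tmp.h for each
 * plane count: first with 64-bit plane arithmetic (RASTER_64), then the
 * 32-bit variants.
 */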
#define RASTER_64 1

#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
