/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"

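/*
 * Each binned triangle carries up to eight edge "planes"; plane i holds an
 * edge-function value c plus per-pixel steps dcdx/dcdy.  A pixel fails an
 * edge when the stepped value goes negative (this file biases c by -1 so
 * the <= 0 test becomes a plain sign-bit test), and the rasterizer gathers
 * those sign bits into per-block coverage masks.  Below is a minimal scalar
 * sketch of that test, guarded out of the build; the edge_plane type and
 * point_is_outside helper are illustrative only, not driver API.
 */
#if 0
struct edge_plane {
   int c;       /* edge-function value at the block origin (biased by -1) */
   int dcdx;    /* value step per +1 pixel in x (pre-negated, see below) */
   int dcdy;    /* value step per +1 pixel in y */
};

static int
point_is_outside(const struct edge_plane *p, int nr, int x, int y)
{
   int i;
   for (i = 0; i < nr; i++) {
      if (p[i].c + x * p[i].dcdx + y * p[i].dcdy < 0)
         return 1;   /* fails this edge's half-plane test */
   }
   return 0;
}
#endif
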
/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

#if !defined(PIPE_ARCH_SSE)

/**
 * Evaluate one edge plane at all 16 pixels of a 4x4 block and return the
 * sign bits as a coverage mask: bit (iy*4 + ix) is set when the value at
 * pixel (ix, iy) is negative, i.e. the pixel is outside this edge's
 * half-plane.
 */
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   int mask = 0;

   int c0 = c;
   int c1 = c0 + dcdy;
   int c2 = c1 + dcdy;
   int c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
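
/* For reference, a loop-form equivalent of the unrolled code above
 * (illustrative sketch, guarded out of the build):
 */
#if 0
static INLINE unsigned
build_mask_linear_ref(int c, int dcdx, int dcdy)
{
   unsigned mask = 0;
   int ix, iy;
   for (iy = 0; iy < 4; iy++) {
      int cy = c + iy * dcdy;
      for (ix = 0; ix < 4; ix++)
         mask |= ((cy + ix * dcdx) >> 31) & (1 << (iy * 4 + ix));
   }
   return mask;
}
#endif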


/**
 * Accumulate trivial-reject and partial-coverage masks for the sub-blocks
 * of a larger block against one edge plane.  outmask gains bits for
 * sub-blocks that are entirely outside the edge; adding cdiff shifts the
 * evaluation from the trivial-reject corner to the trivial-accept corner
 * of each sub-block, so partmask gains bits for sub-blocks that cannot be
 * trivially accepted and need per-pixel work.
 */
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

/*
 * Fallbacks for the specialized 16x16 and 4x4 entry points: without SSE we
 * simply re-dispatch to the general triangle rasterizer with a full plane
 * mask (the block position packed into plane_mask by the binner is
 * discarded; the general path re-derives coverage on its own).
 */
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

#else
#include <emmintrin.h>
#include "util/u_sse.h"


/**
 * SSE twin of the scalar build_masks() above: compute both the
 * trivial-reject and the partial-coverage mask from one set of stepped
 * plane values.
 */
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


/**
 * SSE version of build_mask_linear(): evaluate one edge plane at all 16
 * pixels of a 4x4 block and return the sign bits as a coverage mask.
 */
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
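
/* The double pack works because _mm_packs_epi32/_mm_packs_epi16 saturate:
 * a negative 32-bit value stays negative through both narrowing steps
 * (e.g. -32768 = 0xffff8000 packs to 0x8000 and then to 0x80), while a
 * large positive value saturates to 0x7fff and then 0x7f.  The sign bit of
 * every lane therefore survives, and _mm_movemask_epi8 reads all 16 of
 * them in a single instruction.
 */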

/**
 * Like build_mask_linear(), but for four rows of plane values that have
 * already been stepped into registers; cdiff biases all of them before the
 * sign bits are extracted.
 */
static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}


#define NR_PLANES 3

/**
 * Rasterize a 16x16 block against the triangle's three edge planes,
 * trivially rejecting 4x4 quads and shading the rest through their
 * coverage masks.  For the *_16 variants the binner reuses the plane_mask
 * field to carry the block's position within the tile: the x offset in the
 * low byte, the y offset in the next byte.
 */
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Negate dcdx up front: the plane convention steps by -dcdx per pixel
    * in +x, and negating here lets the rest of the function use additions
    * only.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison), instead
    * of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
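
#if 0
/* Hypothetical illustration (not driver API) of how the binner would pack
 * the block position that lp_rast_triangle_3_16() decodes from plane_mask:
 * the x offset within the tile in the low byte, the y offset above it.
 */
static unsigned
pack_block_position(unsigned bx, unsigned by)
{
   return (bx & 0xff) | ((by & 0xff) << 8);
}
#endif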


/**
 * Rasterize a single 4x4 block against the three edge planes.  This is the
 * inner loop of lp_rast_triangle_3_16() without the per-block trivial
 * reject, for triangles the binner has reduced to one 4x4 block; the block
 * position arrives packed in plane_mask as above.
 */
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Negate dcdx up front, as in lp_rast_triangle_3_16().
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison), instead
    * of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES
#endif

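/*
 * lp_rast_tri_tmp.h is a template header: each inclusion below stamps out a
 * full set of rasterization entry points specialized for a fixed plane
 * count.  Illustrative sketch of the expansion, for reference only:
 */
#if 0
/* With TAG(x) = x##_3 and NR_PLANES = 3, the template defines, among
 * others, the three-plane rasterizer invoked by the fallbacks above:
 */
void lp_rast_triangle_3(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg);
#endif
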
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"