/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"


/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}


static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
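/*
 * Bit (4*j + i) of the mask built above is the sign bit of the plane
 * value c + i*dcdx + j*dcdy, i.e. it is set iff that position of the
 * 4x4 grid evaluates negative.  This relies on >> of a negative int32_t
 * being an arithmetic shift, which all compilers targeted here provide.
 * A plain-C sketch of the same computation (illustrative only, not
 * compiled):
 */
#if 0
static unsigned
build_mask_linear_ref(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;
   int i, j;
   for (j = 0; j < 4; j++)
      for (i = 0; i < 4; i++)
         if (c + i * dcdx + j * dcdy < 0)
            mask |= 1u << (4 * j + i);   /* e.g. c=-1, dcdx=dcdy=0 -> 0xffff */
   return mask;
}
#endif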


static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}
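/*
 * As used from the block-walking code generated by lp_rast_tri_tmp.h,
 * the caller passes c evaluated at a block's trivial-reject corner and
 * cdiff as the offset to its trivial-accept corner: *outmask accumulates
 * positions that are completely outside the plane, *partmask positions
 * that are not completely inside it.
 */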

void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1 << 3) - 1;
   lp_rast_triangle_3(task, arg2);
}


void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}


void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1 << 4) - 1;
   lp_rast_triangle_4(task, arg2);
}


#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"


static inline void
build_masks_sse(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}


static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}
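/*
 * The helpers above all exploit the same property: _mm_packs_epi32 and
 * _mm_packs_epi16 narrow with signed saturation, so the sign of each
 * 32-bit plane value survives the pack down to 8 bits, and a single
 * _mm_movemask_epi8 then gathers all 16 sign bits into a scalar mask.
 */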

#define NR_PLANES 3

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]);   /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Recalculate eo: easier than trying to load as scalars / shuffle.
    */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   /* Negate dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);
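   /*
    * Walk the 16x16 area as a 4x4 grid of 4x4 blocks.  cx carries the
    * three plane values at the current block origin; adding rej4 (the
    * per-plane trivial-reject offset eo recomputed above, scaled by 4)
    * and testing the sign bits rejects an entire 4x4 block with a single
    * comparison when it lies fully outside any plane.
    */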
   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
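/*
 * Note: these specialized 3-plane entry points decode
 * arg.triangle.plane_mask as a block position within the tile rather
 * than an actual plane mask: x in the low byte, y in the second byte,
 * as seen above and in lp_rast_triangle_32_3_4 below.
 */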
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes).
    */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]);   /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* Negate dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x, y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES

#else

#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)

#include <altivec.h>
#include "util/u_pwr8.h"

static inline void
build_masks_ppc(int c,
                int cdiff,
                int dcdx,
                int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }

   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}

static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c + dcdx, c + dcdx * 2, c + dcdx * 3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}

static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}

#define NR_PLANES 3

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]);   /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]);   /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]);   /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif
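   /*
    * The vec_perm masks above broadcast 32-bit element 0, 1 or 2 across
    * the whole vector; they are the POWER counterpart of the SCALAR_EPI32
    * splat used in the SSE path.  The byte indices differ between little-
    * and big-endian layouts, hence the two variants.
    */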

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Negate dcdx.
    */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

#undef NR_PLANES

#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1 << 3) - 1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */

void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1 << 4) - 1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif


#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) \
   build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) \
   build_mask_linear(c, dcdx, dcdy)
#endif
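/*
 * The remaining entry points are generated from the template in
 * lp_rast_tri_tmp.h: each #include below stamps out a complete triangle
 * rasterizer whose functions are named through TAG() for the given
 * NR_PLANES.  The first group is built with RASTER_64 defined (the
 * wide-arithmetic variants); undefining it then produces the 32-bit
 * _32_* variants.
 */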

#define RASTER_64 1

#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#undef RASTER_64

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"