1 /************************************************************************** 2 * 3 * Copyright 2007 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 *
 **************************************************************************/

/*
 * Binning code for triangles
 */

#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_rect.h"
#include "util/u_sse.h"
#include "lp_perf.h"
#include "lp_setup_context.h"
#include "lp_rast.h"
#include "lp_state_fs.h"
#include "lp_state_setup.h"
#include "lp_context.h"

#include <inttypes.h>

/* Number of float channels per shader input (see lp_setup_alloc_triangle). */
#define NUM_CHANNELS 4

#if defined(PIPE_ARCH_SSE)
#include <emmintrin.h>
#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
#include <altivec.h>
#include "util/u_pwr8.h"
#endif

/**
 * Convert a float coordinate to fixed point: scale by FIXED_ONE and
 * round with util_iround().
 */
static inline int
subpixel_snap(float a)
{
   return util_iround(FIXED_ONE * a);
}

/**
 * Convert a fixed-point value back to float by dividing out FIXED_ONE.
 */
static inline float
fixed_to_float(int a)
{
   return a * (1.0f / FIXED_ONE);
}


/* Position and area in fixed point coordinates */
struct fixed_position {
   /* Vertex coordinates.  Only entries 0..2 carry vertices; the arrays are
    * four wide because the SIMD paths in this file load them as one
    * 4 x int32 vector.
    */
   int32_t x[4];
   int32_t y[4];
   /* Edge deltas: dx01 = x[0] - x[1], dy01 = y[0] - y[1],
    * dx20 = x[2] - x[0], dy20 = y[2] - y[0].
    * The SSE path of calc_fixed_position() writes all four with a single
    * 16-byte store starting at dx01, so keep them contiguous and in this
    * order.
    */
   int32_t dx01;
   int32_t dy01;
   int32_t dx20;
   int32_t dy20;
   /* dx01 * dy20 - dx20 * dy01, computed in 64 bits; the sign is used by
    * the triangle_* entry points to decide the winding.
    */
   int64_t area;
};


/**
 * Alloc space for a new triangle plus the input.a0/dadx/dady arrays
 * immediately after it.
 * The memory is allocated from the per-scene pool, not per-tile.
83 * \param tri_size returns number of bytes allocated 84 * \param num_inputs number of fragment shader inputs 85 * \return pointer to triangle space 86 */ 87 struct lp_rast_triangle * 88 lp_setup_alloc_triangle(struct lp_scene *scene, 89 unsigned nr_inputs, 90 unsigned nr_planes, 91 unsigned *tri_size) 92 { 93 unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); 94 unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane); 95 struct lp_rast_triangle *tri; 96 97 STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0); 98 99 *tri_size = (sizeof(struct lp_rast_triangle) + 100 3 * input_array_sz + 101 plane_sz); 102 103 tri = lp_scene_alloc_aligned( scene, *tri_size, 16 ); 104 if (!tri) 105 return NULL; 106 107 tri->inputs.stride = input_array_sz; 108 109 { 110 char *a = (char *)tri; 111 char *b = (char *)&GET_PLANES(tri)[nr_planes]; 112 assert(b - a == *tri_size); 113 } 114 115 return tri; 116 } 117 118 void 119 lp_setup_print_vertex(struct lp_setup_context *setup, 120 const char *name, 121 const float (*v)[4]) 122 { 123 const struct lp_setup_variant_key *key = &setup->setup.variant->key; 124 int i, j; 125 126 debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", 127 name, 128 v[0][0], v[0][1], v[0][2], v[0][3]); 129 130 for (i = 0; i < key->num_inputs; i++) { 131 const float *in = v[key->inputs[i].src_index]; 132 133 debug_printf(" in[%d] (%s[%d]) %s%s%s%s ", 134 i, 135 name, key->inputs[i].src_index, 136 (key->inputs[i].usage_mask & 0x1) ? "x" : " ", 137 (key->inputs[i].usage_mask & 0x2) ? "y" : " ", 138 (key->inputs[i].usage_mask & 0x4) ? "z" : " ", 139 (key->inputs[i].usage_mask & 0x8) ? "w" : " "); 140 141 for (j = 0; j < 4; j++) 142 if (key->inputs[i].usage_mask & (1<<j)) 143 debug_printf("%.5f ", in[j]); 144 145 debug_printf("\n"); 146 } 147 } 148 149 150 /** 151 * Print triangle vertex attribs (for debug). 
152 */ 153 void 154 lp_setup_print_triangle(struct lp_setup_context *setup, 155 const float (*v0)[4], 156 const float (*v1)[4], 157 const float (*v2)[4]) 158 { 159 debug_printf("triangle\n"); 160 161 { 162 const float ex = v0[0][0] - v2[0][0]; 163 const float ey = v0[0][1] - v2[0][1]; 164 const float fx = v1[0][0] - v2[0][0]; 165 const float fy = v1[0][1] - v2[0][1]; 166 167 /* det = cross(e,f).z */ 168 const float det = ex * fy - ey * fx; 169 if (det < 0.0f) 170 debug_printf(" - ccw\n"); 171 else if (det > 0.0f) 172 debug_printf(" - cw\n"); 173 else 174 debug_printf(" - zero area\n"); 175 } 176 177 lp_setup_print_vertex(setup, "v0", v0); 178 lp_setup_print_vertex(setup, "v1", v1); 179 lp_setup_print_vertex(setup, "v2", v2); 180 } 181 182 183 #define MAX_PLANES 8 184 static unsigned 185 lp_rast_tri_tab[MAX_PLANES+1] = { 186 0, /* should be impossible */ 187 LP_RAST_OP_TRIANGLE_1, 188 LP_RAST_OP_TRIANGLE_2, 189 LP_RAST_OP_TRIANGLE_3, 190 LP_RAST_OP_TRIANGLE_4, 191 LP_RAST_OP_TRIANGLE_5, 192 LP_RAST_OP_TRIANGLE_6, 193 LP_RAST_OP_TRIANGLE_7, 194 LP_RAST_OP_TRIANGLE_8 195 }; 196 197 static unsigned 198 lp_rast_32_tri_tab[MAX_PLANES+1] = { 199 0, /* should be impossible */ 200 LP_RAST_OP_TRIANGLE_32_1, 201 LP_RAST_OP_TRIANGLE_32_2, 202 LP_RAST_OP_TRIANGLE_32_3, 203 LP_RAST_OP_TRIANGLE_32_4, 204 LP_RAST_OP_TRIANGLE_32_5, 205 LP_RAST_OP_TRIANGLE_32_6, 206 LP_RAST_OP_TRIANGLE_32_7, 207 LP_RAST_OP_TRIANGLE_32_8 208 }; 209 210 211 212 /** 213 * The primitive covers the whole tile- shade whole tile. 
214 * 215 * \param tx, ty the tile position in tiles, not pixels 216 */ 217 static boolean 218 lp_setup_whole_tile(struct lp_setup_context *setup, 219 const struct lp_rast_shader_inputs *inputs, 220 int tx, int ty) 221 { 222 struct lp_scene *scene = setup->scene; 223 224 LP_COUNT(nr_fully_covered_64); 225 226 /* if variant is opaque and scissor doesn't effect the tile */ 227 if (inputs->opaque) { 228 /* Several things prevent this optimization from working: 229 * - For layered rendering we can't determine if this covers the same layer 230 * as previous rendering (or in case of clears those actually always cover 231 * all layers so optimization is impossible). Need to use fb_max_layer and 232 * not setup->layer_slot to determine this since even if there's currently 233 * no slot assigned previous rendering could have used one. 234 * - If there were any Begin/End query commands in the scene then those 235 * would get removed which would be very wrong. Furthermore, if queries 236 * were just active we also can't do the optimization since to get 237 * accurate query results we unfortunately need to execute the rendering 238 * commands. 239 */ 240 if (!scene->fb.zsbuf && scene->fb_max_layer == 0 && !scene->had_queries) { 241 /* 242 * All previous rendering will be overwritten so reset the bin. 243 */ 244 lp_scene_bin_reset( scene, tx, ty ); 245 } 246 247 LP_COUNT(nr_shade_opaque_64); 248 return lp_scene_bin_cmd_with_state( scene, tx, ty, 249 setup->fs.stored, 250 LP_RAST_OP_SHADE_TILE_OPAQUE, 251 lp_rast_arg_inputs(inputs) ); 252 } else { 253 LP_COUNT(nr_shade_64); 254 return lp_scene_bin_cmd_with_state( scene, tx, ty, 255 setup->fs.stored, 256 LP_RAST_OP_SHADE_TILE, 257 lp_rast_arg_inputs(inputs) ); 258 } 259 } 260 261 262 /** 263 * Do basic setup for triangle rasterization and determine which 264 * framebuffer tiles are touched. Put the triangle in the scene's 265 * bins for the tiles which we overlap. 
266 */ 267 static boolean 268 do_triangle_ccw(struct lp_setup_context *setup, 269 struct fixed_position* position, 270 const float (*v0)[4], 271 const float (*v1)[4], 272 const float (*v2)[4], 273 boolean frontfacing ) 274 { 275 struct lp_scene *scene = setup->scene; 276 const struct lp_setup_variant_key *key = &setup->setup.variant->key; 277 struct lp_rast_triangle *tri; 278 struct lp_rast_plane *plane; 279 struct u_rect bbox; 280 unsigned tri_bytes; 281 int nr_planes = 3; 282 unsigned viewport_index = 0; 283 unsigned layer = 0; 284 const float (*pv)[4]; 285 286 /* Area should always be positive here */ 287 assert(position->area > 0); 288 289 if (0) 290 lp_setup_print_triangle(setup, v0, v1, v2); 291 292 if (setup->flatshade_first) { 293 pv = v0; 294 } 295 else { 296 pv = v2; 297 } 298 if (setup->viewport_index_slot > 0) { 299 unsigned *udata = (unsigned*)pv[setup->viewport_index_slot]; 300 viewport_index = lp_clamp_viewport_idx(*udata); 301 } 302 if (setup->layer_slot > 0) { 303 layer = *(unsigned*)pv[setup->layer_slot]; 304 layer = MIN2(layer, scene->fb_max_layer); 305 } 306 307 /* Bounding rectangle (in pixels) */ 308 { 309 /* Yes this is necessary to accurately calculate bounding boxes 310 * with the two fill-conventions we support. GL (normally) ends 311 * up needing a bottom-left fill convention, which requires 312 * slightly different rounding. 313 */ 314 int adj = (setup->bottom_edge_rule != 0) ? 
1 : 0; 315 316 /* Inclusive x0, exclusive x1 */ 317 bbox.x0 = MIN3(position->x[0], position->x[1], position->x[2]) >> FIXED_ORDER; 318 bbox.x1 = (MAX3(position->x[0], position->x[1], position->x[2]) - 1) >> FIXED_ORDER; 319 320 /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */ 321 bbox.y0 = (MIN3(position->y[0], position->y[1], position->y[2]) + adj) >> FIXED_ORDER; 322 bbox.y1 = (MAX3(position->y[0], position->y[1], position->y[2]) - 1 + adj) >> FIXED_ORDER; 323 } 324 325 if (bbox.x1 < bbox.x0 || 326 bbox.y1 < bbox.y0) { 327 if (0) debug_printf("empty bounding box\n"); 328 LP_COUNT(nr_culled_tris); 329 return TRUE; 330 } 331 332 if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) { 333 if (0) debug_printf("offscreen\n"); 334 LP_COUNT(nr_culled_tris); 335 return TRUE; 336 } 337 338 /* Can safely discard negative regions, but need to keep hold of 339 * information about when the triangle extends past screen 340 * boundaries. See trimmed_box in lp_setup_bin_triangle(). 341 */ 342 bbox.x0 = MAX2(bbox.x0, 0); 343 bbox.y0 = MAX2(bbox.y0, 0); 344 345 nr_planes = 3; 346 /* 347 * Determine how many scissor planes we need, that is drop scissor 348 * edges if the bounding box of the tri is fully inside that edge. 
349 */ 350 if (setup->scissor_test) { 351 /* why not just use draw_regions */ 352 boolean s_planes[4]; 353 scissor_planes_needed(s_planes, &bbox, &setup->scissors[viewport_index]); 354 nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3]; 355 } 356 357 tri = lp_setup_alloc_triangle(scene, 358 key->num_inputs, 359 nr_planes, 360 &tri_bytes); 361 if (!tri) 362 return FALSE; 363 364 #if 0 365 tri->v[0][0] = v0[0][0]; 366 tri->v[1][0] = v1[0][0]; 367 tri->v[2][0] = v2[0][0]; 368 tri->v[0][1] = v0[0][1]; 369 tri->v[1][1] = v1[0][1]; 370 tri->v[2][1] = v2[0][1]; 371 #endif 372 373 LP_COUNT(nr_tris); 374 375 /* Setup parameter interpolants: 376 */ 377 setup->setup.variant->jit_function(v0, v1, v2, 378 frontfacing, 379 GET_A0(&tri->inputs), 380 GET_DADX(&tri->inputs), 381 GET_DADY(&tri->inputs)); 382 383 tri->inputs.frontfacing = frontfacing; 384 tri->inputs.disable = FALSE; 385 tri->inputs.opaque = setup->fs.current.variant->opaque; 386 tri->inputs.layer = layer; 387 tri->inputs.viewport_index = viewport_index; 388 389 if (0) 390 lp_dump_setup_coef(&setup->setup.variant->key, 391 (const float (*)[4])GET_A0(&tri->inputs), 392 (const float (*)[4])GET_DADX(&tri->inputs), 393 (const float (*)[4])GET_DADY(&tri->inputs)); 394 395 plane = GET_PLANES(tri); 396 397 #if defined(PIPE_ARCH_SSE) 398 if (1) { 399 __m128i vertx, verty; 400 __m128i shufx, shufy; 401 __m128i dcdx, dcdy; 402 __m128i cdx02, cdx13, cdy02, cdy13, c02, c13; 403 __m128i c01, c23, unused; 404 __m128i dcdx_neg_mask; 405 __m128i dcdy_neg_mask; 406 __m128i dcdx_zero_mask; 407 __m128i top_left_flag, c_dec; 408 __m128i eo, p0, p1, p2; 409 __m128i zero = _mm_setzero_si128(); 410 411 vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */ 412 verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */ 413 414 shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1)); 415 shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1)); 416 417 dcdx = _mm_sub_epi32(verty, shufy); 418 dcdy = 
_mm_sub_epi32(vertx, shufx); 419 420 dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); 421 dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero); 422 dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); 423 424 top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0); 425 426 c_dec = _mm_or_si128(dcdx_neg_mask, 427 _mm_and_si128(dcdx_zero_mask, 428 _mm_xor_si128(dcdy_neg_mask, 429 top_left_flag))); 430 431 /* 432 * 64 bit arithmetic. 433 * Note we need _signed_ mul (_mm_mul_epi32) which we emulate. 434 */ 435 cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13); 436 cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13); 437 c02 = _mm_sub_epi64(cdx02, cdy02); 438 c13 = _mm_sub_epi64(cdx13, cdy13); 439 c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec, 440 _MM_SHUFFLE(2,2,0,0))); 441 c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec, 442 _MM_SHUFFLE(3,3,1,1))); 443 444 /* 445 * Useful for very small fbs/tris (or fewer subpixel bits) only: 446 * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx), 447 * mm_mullo_epi32(dcdy, verty)); 448 * 449 * c = _mm_sub_epi32(c, c_dec); 450 */ 451 452 /* Scale up to match c: 453 */ 454 dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER); 455 dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER); 456 457 /* 458 * Calculate trivial reject values: 459 * Note eo cannot overflow even if dcdx/dcdy would already have 460 * 31 bits (which they shouldn't have). This is because eo 461 * is never negative (albeit if we rely on that need to be careful...) 462 */ 463 eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), 464 _mm_and_si128(dcdx_neg_mask, dcdx)); 465 466 /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ 467 468 /* 469 * Pointless transpose which gets undone immediately in 470 * rasterization. 471 * It is actually difficult to do away with it - would essentially 472 * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations 473 * for this then would need to depend on the number of planes. 474 * The transpose is quite special here due to c being 64bit... 
475 * The store has to be unaligned (unless we'd make the plane size 476 * a multiple of 128), and of course storing eo separately... 477 */ 478 c01 = _mm_unpacklo_epi64(c02, c13); 479 c23 = _mm_unpackhi_epi64(c02, c13); 480 transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy, 481 &p0, &p1, &p2, &unused); 482 _mm_storeu_si128((__m128i *)&plane[0], p0); 483 plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo); 484 _mm_storeu_si128((__m128i *)&plane[1], p1); 485 eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1)); 486 plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo); 487 _mm_storeu_si128((__m128i *)&plane[2], p2); 488 eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2)); 489 plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo); 490 } else 491 #elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) 492 /* 493 * XXX this code is effectively disabled for all practical purposes, 494 * as the allowed fb size is tiny if FIXED_ORDER is 8. 495 */ 496 if (setup->fb.width <= MAX_FIXED_LENGTH32 && 497 setup->fb.height <= MAX_FIXED_LENGTH32 && 498 (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && 499 (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { 500 unsigned int bottom_edge; 501 __m128i vertx, verty; 502 __m128i shufx, shufy; 503 __m128i dcdx, dcdy, c; 504 __m128i unused; 505 __m128i dcdx_neg_mask; 506 __m128i dcdy_neg_mask; 507 __m128i dcdx_zero_mask; 508 __m128i top_left_flag; 509 __m128i c_inc_mask, c_inc; 510 __m128i eo, p0, p1, p2; 511 __m128i_union vshuf_mask; 512 __m128i zero = vec_splats((unsigned char) 0); 513 PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; 514 515 #ifdef PIPE_ARCH_LITTLE_ENDIAN 516 vshuf_mask.i[0] = 0x07060504; 517 vshuf_mask.i[1] = 0x0B0A0908; 518 vshuf_mask.i[2] = 0x03020100; 519 vshuf_mask.i[3] = 0x0F0E0D0C; 520 #else 521 vshuf_mask.i[0] = 0x00010203; 522 vshuf_mask.i[1] = 0x0C0D0E0F; 523 vshuf_mask.i[2] = 0x04050607; 524 vshuf_mask.i[3] = 0x08090A0B; 525 #endif 526 527 /* vertex x coords */ 528 vertx = vec_load_si128((const uint32_t *) position->x); 529 /* vertex y coords */ 530 verty = 
vec_load_si128((const uint32_t *) position->y); 531 532 shufx = vec_perm (vertx, vertx, vshuf_mask.m128i); 533 shufy = vec_perm (verty, verty, vshuf_mask.m128i); 534 535 dcdx = vec_sub_epi32(verty, shufy); 536 dcdy = vec_sub_epi32(vertx, shufx); 537 538 dcdx_neg_mask = vec_srai_epi32(dcdx, 31); 539 dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero); 540 dcdy_neg_mask = vec_srai_epi32(dcdy, 31); 541 542 bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0; 543 top_left_flag = (__m128i) vec_splats(bottom_edge); 544 545 c_inc_mask = vec_or(dcdx_neg_mask, 546 vec_and(dcdx_zero_mask, 547 vec_xor(dcdy_neg_mask, 548 top_left_flag))); 549 550 c_inc = vec_srli_epi32(c_inc_mask, 31); 551 552 c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx), 553 vec_mullo_epi32(dcdy, verty)); 554 555 c = vec_add_epi32(c, c_inc); 556 557 /* Scale up to match c: 558 */ 559 dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); 560 dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); 561 562 /* Calculate trivial reject values: 563 */ 564 eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy), 565 vec_and(dcdx_neg_mask, dcdx)); 566 567 /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ 568 569 /* Pointless transpose which gets undone immediately in 570 * rasterization: 571 */ 572 transpose4_epi32(&c, &dcdx, &dcdy, &eo, 573 &p0, &p1, &p2, &unused); 574 575 #define STORE_PLANE(plane, vec) do { \ 576 vec_store_si128((uint32_t *)&temp_vec, vec); \ 577 plane.c = (int64_t)temp_vec[0]; \ 578 plane.dcdx = temp_vec[1]; \ 579 plane.dcdy = temp_vec[2]; \ 580 plane.eo = temp_vec[3]; \ 581 } while(0) 582 583 STORE_PLANE(plane[0], p0); 584 STORE_PLANE(plane[1], p1); 585 STORE_PLANE(plane[2], p2); 586 #undef STORE_PLANE 587 } else 588 #endif 589 { 590 int i; 591 plane[0].dcdy = position->dx01; 592 plane[1].dcdy = position->x[1] - position->x[2]; 593 plane[2].dcdy = position->dx20; 594 plane[0].dcdx = position->dy01; 595 plane[1].dcdx = position->y[1] - position->y[2]; 596 plane[2].dcdx = position->dy20; 597 598 for (i = 0; i < 3; 
i++) { 599 /* half-edge constants, will be iterated over the whole render 600 * target. 601 */ 602 plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) - 603 IMUL64(plane[i].dcdy, position->y[i]); 604 605 /* correct for top-left vs. bottom-left fill convention. 606 */ 607 if (plane[i].dcdx < 0) { 608 /* both fill conventions want this - adjust for left edges */ 609 plane[i].c++; 610 } 611 else if (plane[i].dcdx == 0) { 612 if (setup->bottom_edge_rule == 0){ 613 /* correct for top-left fill convention: 614 */ 615 if (plane[i].dcdy > 0) plane[i].c++; 616 } 617 else { 618 /* correct for bottom-left fill convention: 619 */ 620 if (plane[i].dcdy < 0) plane[i].c++; 621 } 622 } 623 624 /* Scale up to match c: 625 */ 626 assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx); 627 assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy); 628 plane[i].dcdx <<= FIXED_ORDER; 629 plane[i].dcdy <<= FIXED_ORDER; 630 631 /* find trivial reject offsets for each edge for a single-pixel 632 * sized block. These will be scaled up at each recursive level to 633 * match the active blocksize. Scaling in this way works best if 634 * the blocks are square. 635 */ 636 plane[i].eo = 0; 637 if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx; 638 if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; 639 } 640 } 641 642 if (0) { 643 debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n", 644 plane[0].c, 645 plane[0].dcdx, 646 plane[0].dcdy, 647 plane[0].eo); 648 649 debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n", 650 plane[1].c, 651 plane[1].dcdx, 652 plane[1].dcdy, 653 plane[1].eo); 654 655 debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n", 656 plane[2].c, 657 plane[2].dcdx, 658 plane[2].dcdy, 659 plane[2].eo); 660 } 661 662 663 /* 664 * When rasterizing scissored tris, use the intersection of the 665 * triangle bounding box and the scissor rect to generate the 666 * scissor planes. 
667 * 668 * This permits us to cut off the triangle "tails" that are present 669 * in the intermediate recursive levels caused when two of the 670 * triangles edges don't diverge quickly enough to trivially reject 671 * exterior blocks from the triangle. 672 * 673 * It's not really clear if it's worth worrying about these tails, 674 * but since we generate the planes for each scissored tri, it's 675 * free to trim them in this case. 676 * 677 * Note that otherwise, the scissor planes only vary in 'C' value, 678 * and even then only on state-changes. Could alternatively store 679 * these planes elsewhere. 680 * (Or only store the c value together with a bit indicating which 681 * scissor edge this is, so rasterization would treat them differently 682 * (easier to evaluate) to ordinary planes.) 683 */ 684 if (nr_planes > 3) { 685 /* why not just use draw_regions */ 686 const struct u_rect *scissor = &setup->scissors[viewport_index]; 687 struct lp_rast_plane *plane_s = &plane[3]; 688 boolean s_planes[4]; 689 scissor_planes_needed(s_planes, &bbox, scissor); 690 691 if (s_planes[0]) { 692 plane_s->dcdx = -1 << 8; 693 plane_s->dcdy = 0; 694 plane_s->c = (1-scissor->x0) << 8; 695 plane_s->eo = 1 << 8; 696 plane_s++; 697 } 698 if (s_planes[1]) { 699 plane_s->dcdx = 1 << 8; 700 plane_s->dcdy = 0; 701 plane_s->c = (scissor->x1+1) << 8; 702 plane_s->eo = 0 << 8; 703 plane_s++; 704 } 705 if (s_planes[2]) { 706 plane_s->dcdx = 0; 707 plane_s->dcdy = 1 << 8; 708 plane_s->c = (1-scissor->y0) << 8; 709 plane_s->eo = 1 << 8; 710 plane_s++; 711 } 712 if (s_planes[3]) { 713 plane_s->dcdx = 0; 714 plane_s->dcdy = -1 << 8; 715 plane_s->c = (scissor->y1+1) << 8; 716 plane_s->eo = 0; 717 plane_s++; 718 } 719 assert(plane_s == &plane[nr_planes]); 720 } 721 722 return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index); 723 } 724 725 /* 726 * Round to nearest less or equal power of two of the input. 
727 * 728 * Undefined if no bit set exists, so code should check against 0 first. 729 */ 730 static inline uint32_t 731 floor_pot(uint32_t n) 732 { 733 #if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) 734 if (n == 0) 735 return 0; 736 737 __asm__("bsr %1,%0" 738 : "=r" (n) 739 : "rm" (n)); 740 return 1 << n; 741 #else 742 n |= (n >> 1); 743 n |= (n >> 2); 744 n |= (n >> 4); 745 n |= (n >> 8); 746 n |= (n >> 16); 747 return n - (n >> 1); 748 #endif 749 } 750 751 752 boolean 753 lp_setup_bin_triangle( struct lp_setup_context *setup, 754 struct lp_rast_triangle *tri, 755 const struct u_rect *bbox, 756 int nr_planes, 757 unsigned viewport_index ) 758 { 759 struct lp_scene *scene = setup->scene; 760 struct u_rect trimmed_box = *bbox; 761 int i; 762 /* What is the largest power-of-two boundary this triangle crosses: 763 */ 764 int dx = floor_pot((bbox->x0 ^ bbox->x1) | 765 (bbox->y0 ^ bbox->y1)); 766 767 /* The largest dimension of the rasterized area of the triangle 768 * (aligned to a 4x4 grid), rounded down to the nearest power of two: 769 */ 770 int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) | 771 (bbox->y1 - (bbox->y0 & ~3))); 772 int sz = floor_pot(max_sz); 773 boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32; 774 775 /* Now apply scissor, etc to the bounding box. Could do this 776 * earlier, but it confuses the logic for tri-16 and would force 777 * the rasterizer to also respect scissor, etc, just for the rare 778 * cases where a small triangle extends beyond the scissor. 
779 */ 780 u_rect_find_intersection(&setup->draw_regions[viewport_index], 781 &trimmed_box); 782 783 /* Determine which tile(s) intersect the triangle's bounding box 784 */ 785 if (dx < TILE_SIZE) 786 { 787 int ix0 = bbox->x0 / TILE_SIZE; 788 int iy0 = bbox->y0 / TILE_SIZE; 789 unsigned px = bbox->x0 & 63 & ~3; 790 unsigned py = bbox->y0 & 63 & ~3; 791 792 assert(iy0 == bbox->y1 / TILE_SIZE && 793 ix0 == bbox->x1 / TILE_SIZE); 794 795 if (nr_planes == 3) { 796 if (sz < 4) 797 { 798 /* Triangle is contained in a single 4x4 stamp: 799 */ 800 assert(px + 4 <= TILE_SIZE); 801 assert(py + 4 <= TILE_SIZE); 802 return lp_scene_bin_cmd_with_state( scene, ix0, iy0, 803 setup->fs.stored, 804 use_32bits ? 805 LP_RAST_OP_TRIANGLE_32_3_4 : 806 LP_RAST_OP_TRIANGLE_3_4, 807 lp_rast_arg_triangle_contained(tri, px, py) ); 808 } 809 810 if (sz < 16) 811 { 812 /* Triangle is contained in a single 16x16 block: 813 */ 814 815 /* 816 * The 16x16 block is only 4x4 aligned, and can exceed the tile 817 * dimensions if the triangle is 16 pixels in one dimension but 4 818 * in the other. So budge the 16x16 back inside the tile. 819 */ 820 px = MIN2(px, TILE_SIZE - 16); 821 py = MIN2(py, TILE_SIZE - 16); 822 823 assert(px + 16 <= TILE_SIZE); 824 assert(py + 16 <= TILE_SIZE); 825 826 return lp_scene_bin_cmd_with_state( scene, ix0, iy0, 827 setup->fs.stored, 828 use_32bits ? 829 LP_RAST_OP_TRIANGLE_32_3_16 : 830 LP_RAST_OP_TRIANGLE_3_16, 831 lp_rast_arg_triangle_contained(tri, px, py) ); 832 } 833 } 834 else if (nr_planes == 4 && sz < 16) 835 { 836 px = MIN2(px, TILE_SIZE - 16); 837 py = MIN2(py, TILE_SIZE - 16); 838 839 assert(px + 16 <= TILE_SIZE); 840 assert(py + 16 <= TILE_SIZE); 841 842 return lp_scene_bin_cmd_with_state(scene, ix0, iy0, 843 setup->fs.stored, 844 use_32bits ? 
845 LP_RAST_OP_TRIANGLE_32_4_16 : 846 LP_RAST_OP_TRIANGLE_4_16, 847 lp_rast_arg_triangle_contained(tri, px, py)); 848 } 849 850 851 /* Triangle is contained in a single tile: 852 */ 853 return lp_scene_bin_cmd_with_state( 854 scene, ix0, iy0, setup->fs.stored, 855 use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes], 856 lp_rast_arg_triangle(tri, (1<<nr_planes)-1)); 857 } 858 else 859 { 860 struct lp_rast_plane *plane = GET_PLANES(tri); 861 int64_t c[MAX_PLANES]; 862 int64_t ei[MAX_PLANES]; 863 864 int64_t eo[MAX_PLANES]; 865 int64_t xstep[MAX_PLANES]; 866 int64_t ystep[MAX_PLANES]; 867 int x, y; 868 869 int ix0 = trimmed_box.x0 / TILE_SIZE; 870 int iy0 = trimmed_box.y0 / TILE_SIZE; 871 int ix1 = trimmed_box.x1 / TILE_SIZE; 872 int iy1 = trimmed_box.y1 / TILE_SIZE; 873 874 for (i = 0; i < nr_planes; i++) { 875 c[i] = (plane[i].c + 876 IMUL64(plane[i].dcdy, iy0) * TILE_SIZE - 877 IMUL64(plane[i].dcdx, ix0) * TILE_SIZE); 878 879 ei[i] = (plane[i].dcdy - 880 plane[i].dcdx - 881 (int64_t)plane[i].eo) << TILE_ORDER; 882 883 eo[i] = (int64_t)plane[i].eo << TILE_ORDER; 884 xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER); 885 ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER; 886 } 887 888 889 890 /* Test tile-sized blocks against the triangle. 891 * Discard blocks fully outside the tri. If the block is fully 892 * contained inside the tri, bin an lp_rast_shade_tile command. 893 * Else, bin a lp_rast_triangle command. 894 */ 895 for (y = iy0; y <= iy1; y++) 896 { 897 boolean in = FALSE; /* are we inside the triangle? 
*/ 898 int64_t cx[MAX_PLANES]; 899 900 for (i = 0; i < nr_planes; i++) 901 cx[i] = c[i]; 902 903 for (x = ix0; x <= ix1; x++) 904 { 905 int out = 0; 906 int partial = 0; 907 908 for (i = 0; i < nr_planes; i++) { 909 int64_t planeout = cx[i] + eo[i]; 910 int64_t planepartial = cx[i] + ei[i] - 1; 911 out |= (int) (planeout >> 63); 912 partial |= ((int) (planepartial >> 63)) & (1<<i); 913 } 914 915 if (out) { 916 /* do nothing */ 917 if (in) 918 break; /* exiting triangle, all done with this row */ 919 LP_COUNT(nr_empty_64); 920 } 921 else if (partial) { 922 /* Not trivially accepted by at least one plane - 923 * rasterize/shade partial tile 924 */ 925 int count = util_bitcount(partial); 926 in = TRUE; 927 928 if (!lp_scene_bin_cmd_with_state( scene, x, y, 929 setup->fs.stored, 930 use_32bits ? 931 lp_rast_32_tri_tab[count] : 932 lp_rast_tri_tab[count], 933 lp_rast_arg_triangle(tri, partial) )) 934 goto fail; 935 936 LP_COUNT(nr_partially_covered_64); 937 } 938 else { 939 /* triangle covers the whole tile- shade whole tile */ 940 LP_COUNT(nr_fully_covered_64); 941 in = TRUE; 942 if (!lp_setup_whole_tile(setup, &tri->inputs, x, y)) 943 goto fail; 944 } 945 946 /* Iterate cx values across the region: */ 947 for (i = 0; i < nr_planes; i++) 948 cx[i] += xstep[i]; 949 } 950 951 /* Iterate c values down the region: */ 952 for (i = 0; i < nr_planes; i++) 953 c[i] += ystep[i]; 954 } 955 } 956 957 return TRUE; 958 959 fail: 960 /* Need to disable any partially binned triangle. This is easier 961 * than trying to locate all the triangle, shade-tile, etc, 962 * commands which may have been binned. 963 */ 964 tri->inputs.disable = TRUE; 965 return FALSE; 966 } 967 968 969 /** 970 * Try to draw the triangle, restart the scene on failure. 
971 */ 972 static void retry_triangle_ccw( struct lp_setup_context *setup, 973 struct fixed_position* position, 974 const float (*v0)[4], 975 const float (*v1)[4], 976 const float (*v2)[4], 977 boolean front) 978 { 979 if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) 980 { 981 if (!lp_setup_flush_and_restart(setup)) 982 return; 983 984 if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) 985 return; 986 } 987 } 988 989 /** 990 * Calculate fixed position data for a triangle 991 * It is unfortunate we need to do that here (as we need area 992 * calculated in fixed point), as there's quite some code duplication 993 * to what is done in the jit setup prog. 994 */ 995 static inline void 996 calc_fixed_position(struct lp_setup_context *setup, 997 struct fixed_position* position, 998 const float (*v0)[4], 999 const float (*v1)[4], 1000 const float (*v2)[4]) 1001 { 1002 /* 1003 * The rounding may not be quite the same with PIPE_ARCH_SSE 1004 * (util_iround right now only does nearest/even on x87, 1005 * otherwise nearest/away-from-zero). 1006 * Both should be acceptable, I think. 
1007 */ 1008 #if defined(PIPE_ARCH_SSE) 1009 __m128 v0r, v1r; 1010 __m128 vxy0xy2, vxy1xy0; 1011 __m128i vxy0xy2i, vxy1xy0i; 1012 __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120; 1013 __m128 pix_offset = _mm_set1_ps(setup->pixel_offset); 1014 __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE); 1015 v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0])); 1016 vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]); 1017 v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0])); 1018 vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2); 1019 vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset); 1020 vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset); 1021 vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one); 1022 vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one); 1023 vxy0xy2i = _mm_cvtps_epi32(vxy0xy2); 1024 vxy1xy0i = _mm_cvtps_epi32(vxy1xy0); 1025 dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i); 1026 _mm_store_si128((__m128i *)&position->dx01, dxdy0120); 1027 /* 1028 * For the mul, would need some more shuffles, plus emulation 1029 * for the signed mul (without sse41), so don't bother. 
1030 */ 1031 x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0)); 1032 x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0)); 1033 x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0); 1034 y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0); 1035 _mm_store_si128((__m128i *)&position->x[0], x0120); 1036 _mm_store_si128((__m128i *)&position->y[0], y0120); 1037 1038 #else 1039 position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset); 1040 position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset); 1041 position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset); 1042 position->x[3] = 0; // should be unused 1043 1044 position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset); 1045 position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset); 1046 position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset); 1047 position->y[3] = 0; // should be unused 1048 1049 position->dx01 = position->x[0] - position->x[1]; 1050 position->dy01 = position->y[0] - position->y[1]; 1051 1052 position->dx20 = position->x[2] - position->x[0]; 1053 position->dy20 = position->y[2] - position->y[0]; 1054 #endif 1055 1056 position->area = IMUL64(position->dx01, position->dy20) - 1057 IMUL64(position->dx20, position->dy01); 1058 } 1059 1060 1061 /** 1062 * Rotate a triangle, flipping its clockwise direction, 1063 * Swaps values for xy[0] and xy[1] 1064 */ 1065 static inline void 1066 rotate_fixed_position_01( struct fixed_position* position ) 1067 { 1068 int x, y; 1069 1070 x = position->x[1]; 1071 y = position->y[1]; 1072 position->x[1] = position->x[0]; 1073 position->y[1] = position->y[0]; 1074 position->x[0] = x; 1075 position->y[0] = y; 1076 1077 position->dx01 = -position->dx01; 1078 position->dy01 = -position->dy01; 1079 position->dx20 = position->x[2] - position->x[0]; 1080 position->dy20 = position->y[2] - position->y[0]; 1081 1082 position->area = -position->area; 1083 } 1084 1085 1086 /** 1087 * Rotate a triangle, flipping its clockwise direction, 1088 * 
Swaps values for xy[1] and xy[2] 1089 */ 1090 static inline void 1091 rotate_fixed_position_12( struct fixed_position* position ) 1092 { 1093 int x, y; 1094 1095 x = position->x[2]; 1096 y = position->y[2]; 1097 position->x[2] = position->x[1]; 1098 position->y[2] = position->y[1]; 1099 position->x[1] = x; 1100 position->y[1] = y; 1101 1102 x = position->dx01; 1103 y = position->dy01; 1104 position->dx01 = -position->dx20; 1105 position->dy01 = -position->dy20; 1106 position->dx20 = -x; 1107 position->dy20 = -y; 1108 1109 position->area = -position->area; 1110 } 1111 1112 1113 /** 1114 * Draw triangle if it's CW, cull otherwise. 1115 */ 1116 static void triangle_cw(struct lp_setup_context *setup, 1117 const float (*v0)[4], 1118 const float (*v1)[4], 1119 const float (*v2)[4]) 1120 { 1121 PIPE_ALIGN_VAR(16) struct fixed_position position; 1122 1123 calc_fixed_position(setup, &position, v0, v1, v2); 1124 1125 if (position.area < 0) { 1126 if (setup->flatshade_first) { 1127 rotate_fixed_position_12(&position); 1128 retry_triangle_ccw(setup, &position, v0, v2, v1, !setup->ccw_is_frontface); 1129 } else { 1130 rotate_fixed_position_01(&position); 1131 retry_triangle_ccw(setup, &position, v1, v0, v2, !setup->ccw_is_frontface); 1132 } 1133 } 1134 } 1135 1136 1137 static void triangle_ccw(struct lp_setup_context *setup, 1138 const float (*v0)[4], 1139 const float (*v1)[4], 1140 const float (*v2)[4]) 1141 { 1142 PIPE_ALIGN_VAR(16) struct fixed_position position; 1143 1144 calc_fixed_position(setup, &position, v0, v1, v2); 1145 1146 if (position.area > 0) 1147 retry_triangle_ccw(setup, &position, v0, v1, v2, setup->ccw_is_frontface); 1148 } 1149 1150 /** 1151 * Draw triangle whether it's CW or CCW. 
1152 */ 1153 static void triangle_both(struct lp_setup_context *setup, 1154 const float (*v0)[4], 1155 const float (*v1)[4], 1156 const float (*v2)[4]) 1157 { 1158 PIPE_ALIGN_VAR(16) struct fixed_position position; 1159 struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; 1160 1161 if (lp_context->active_statistics_queries && 1162 !llvmpipe_rasterization_disabled(lp_context)) { 1163 lp_context->pipeline_statistics.c_primitives++; 1164 } 1165 1166 calc_fixed_position(setup, &position, v0, v1, v2); 1167 1168 if (0) { 1169 assert(!util_is_inf_or_nan(v0[0][0])); 1170 assert(!util_is_inf_or_nan(v0[0][1])); 1171 assert(!util_is_inf_or_nan(v1[0][0])); 1172 assert(!util_is_inf_or_nan(v1[0][1])); 1173 assert(!util_is_inf_or_nan(v2[0][0])); 1174 assert(!util_is_inf_or_nan(v2[0][1])); 1175 } 1176 1177 if (position.area > 0) 1178 retry_triangle_ccw( setup, &position, v0, v1, v2, setup->ccw_is_frontface ); 1179 else if (position.area < 0) { 1180 if (setup->flatshade_first) { 1181 rotate_fixed_position_12( &position ); 1182 retry_triangle_ccw( setup, &position, v0, v2, v1, !setup->ccw_is_frontface ); 1183 } else { 1184 rotate_fixed_position_01( &position ); 1185 retry_triangle_ccw( setup, &position, v1, v0, v2, !setup->ccw_is_frontface ); 1186 } 1187 } 1188 } 1189 1190 1191 static void triangle_nop( struct lp_setup_context *setup, 1192 const float (*v0)[4], 1193 const float (*v1)[4], 1194 const float (*v2)[4] ) 1195 { 1196 } 1197 1198 1199 void 1200 lp_setup_choose_triangle( struct lp_setup_context *setup ) 1201 { 1202 switch (setup->cullmode) { 1203 case PIPE_FACE_NONE: 1204 setup->triangle = triangle_both; 1205 break; 1206 case PIPE_FACE_BACK: 1207 setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw; 1208 break; 1209 case PIPE_FACE_FRONT: 1210 setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw; 1211 break; 1212 default: 1213 setup->triangle = triangle_nop; 1214 break; 1215 } 1216 } 1217