/**************************************************************************
 *
 * Copyright 2007-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */



/**
 * Prototype for an 8-plane rasterizer function.  Will code-generate
 * several of these.
 *
 * XXX: Variants for more/fewer planes.
 * XXX: Need ways of dropping planes as we descend.
40 * XXX: SIMD 41 */ 42 static void 43 TAG(do_block_4)(struct lp_rasterizer_task *task, 44 const struct lp_rast_triangle *tri, 45 const struct lp_rast_plane *plane, 46 int x, int y, 47 const int64_t *c) 48 { 49 unsigned mask = 0xffff; 50 int j; 51 52 for (j = 0; j < NR_PLANES; j++) { 53 #ifdef RASTER_64 54 mask &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER), 55 -plane[j].dcdx >> FIXED_ORDER, 56 plane[j].dcdy >> FIXED_ORDER); 57 #else 58 mask &= ~BUILD_MASK_LINEAR((c[j] - 1), 59 -plane[j].dcdx, 60 plane[j].dcdy); 61 #endif 62 } 63 64 /* Now pass to the shader: 65 */ 66 if (mask) 67 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); 68 } 69 70 /** 71 * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out 72 * of the triangle's bounds. 73 */ 74 static void 75 TAG(do_block_16)(struct lp_rasterizer_task *task, 76 const struct lp_rast_triangle *tri, 77 const struct lp_rast_plane *plane, 78 int x, int y, 79 const int64_t *c) 80 { 81 unsigned outmask, inmask, partmask, partial_mask; 82 unsigned j; 83 84 outmask = 0; /* outside one or more trivial reject planes */ 85 partmask = 0; /* outside one or more trivial accept planes */ 86 87 for (j = 0; j < NR_PLANES; j++) { 88 #ifdef RASTER_64 89 int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER; 90 int32_t dcdy = plane[j].dcdy >> FIXED_ORDER; 91 const int32_t cox = plane[j].eo >> FIXED_ORDER; 92 const int32_t ei = (dcdy + dcdx - cox) << 2; 93 const int32_t cox_s = cox << 2; 94 const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s; 95 int32_t cdiff; 96 cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) - 97 (int32_t)(c[j] >> (int64_t)FIXED_ORDER)); 98 dcdx <<= 2; 99 dcdy <<= 2; 100 #else 101 const int64_t dcdx = -IMUL64(plane[j].dcdx, 4); 102 const int64_t dcdy = IMUL64(plane[j].dcdy, 4); 103 const int64_t cox = IMUL64(plane[j].eo, 4); 104 const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo; 105 const int64_t cio = IMUL64(ei, 4) - 1; 106 
int32_t co, cdiff; 107 co = c[j] + cox; 108 cdiff = cio - cox; 109 #endif 110 111 BUILD_MASKS(co, cdiff, 112 dcdx, dcdy, 113 &outmask, /* sign bits from c[i][0..15] + cox */ 114 &partmask); /* sign bits from c[i][0..15] + cio */ 115 } 116 117 if (outmask == 0xffff) 118 return; 119 120 /* Mask of sub-blocks which are inside all trivial accept planes: 121 */ 122 inmask = ~partmask & 0xffff; 123 124 /* Mask of sub-blocks which are inside all trivial reject planes, 125 * but outside at least one trivial accept plane: 126 */ 127 partial_mask = partmask & ~outmask; 128 129 assert((partial_mask & inmask) == 0); 130 131 LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask))); 132 133 /* Iterate over partials: 134 */ 135 while (partial_mask) { 136 int i = ffs(partial_mask) - 1; 137 int ix = (i & 3) * 4; 138 int iy = (i >> 2) * 4; 139 int px = x + ix; 140 int py = y + iy; 141 int64_t cx[NR_PLANES]; 142 143 partial_mask &= ~(1 << i); 144 145 LP_COUNT(nr_partially_covered_4); 146 147 for (j = 0; j < NR_PLANES; j++) 148 cx[j] = (c[j] 149 - IMUL64(plane[j].dcdx, ix) 150 + IMUL64(plane[j].dcdy, iy)); 151 152 TAG(do_block_4)(task, tri, plane, px, py, cx); 153 } 154 155 /* Iterate over fulls: 156 */ 157 while (inmask) { 158 int i = ffs(inmask) - 1; 159 int ix = (i & 3) * 4; 160 int iy = (i >> 2) * 4; 161 int px = x + ix; 162 int py = y + iy; 163 164 inmask &= ~(1 << i); 165 166 LP_COUNT(nr_fully_covered_4); 167 block_full_4(task, tri, px, py); 168 } 169 } 170 171 172 /** 173 * Scan the tile in chunks and figure out which pixels to rasterize 174 * for this triangle. 
175 */ 176 void 177 TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, 178 const union lp_rast_cmd_arg arg) 179 { 180 const struct lp_rast_triangle *tri = arg.triangle.tri; 181 unsigned plane_mask = arg.triangle.plane_mask; 182 const struct lp_rast_plane *tri_plane = GET_PLANES(tri); 183 const int x = task->x, y = task->y; 184 struct lp_rast_plane plane[NR_PLANES]; 185 int64_t c[NR_PLANES]; 186 unsigned outmask, inmask, partmask, partial_mask; 187 unsigned j = 0; 188 189 if (tri->inputs.disable) { 190 /* This triangle was partially binned and has been disabled */ 191 return; 192 } 193 194 outmask = 0; /* outside one or more trivial reject planes */ 195 partmask = 0; /* outside one or more trivial accept planes */ 196 197 while (plane_mask) { 198 int i = ffs(plane_mask) - 1; 199 plane[j] = tri_plane[i]; 200 plane_mask &= ~(1 << i); 201 c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x); 202 203 { 204 #ifdef RASTER_64 205 /* 206 * Strip off lower FIXED_ORDER bits. Note that those bits from 207 * dcdx, dcdy, eo are always 0 (by definition). 208 * c values, however, are not. This means that for every 209 * addition of the form c + n*dcdx the lower FIXED_ORDER bits will 210 * NOT change. And those bits are not relevant to the sign bit (which 211 * is only what we need!) that is, 212 * sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER)) 213 * This means we can get away with using 32bit math for the most part. 214 * Only tricky part is the -1 adjustment for cdiff. 215 */ 216 int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER; 217 int32_t dcdy = plane[j].dcdy >> FIXED_ORDER; 218 const int32_t cox = plane[j].eo >> FIXED_ORDER; 219 const int32_t ei = (dcdy + dcdx - cox) << 4; 220 const int32_t cox_s = cox << 4; 221 const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s; 222 int32_t cdiff; 223 /* 224 * Plausibility check to ensure the 32bit math works. 
225 * Note that within a tile, the max we can move the edge function 226 * is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE. 227 * TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size 228 * and 8 subpixel bits), I'd be happy with 2 bits more too (1 for 229 * increasing fb size to 16384, the required d3d11 value, another one 230 * because I'm not quite sure we can't be _just_ above the max value 231 * here). This gives us 30 bits max - hence if c would exceed that here 232 * that means the plane is either trivial reject for the whole tile 233 * (in which case the tri will not get binned), or trivial accept for 234 * the whole tile (in which case plane_mask will not include it). 235 */ 236 assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 && 237 (c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff); 238 /* 239 * Note the fixup part is constant throughout the tile - thus could 240 * just calculate this and avoid _all_ 64bit math in rasterization 241 * (except exactly this fixup calc). 242 * In fact theoretically could move that even to setup, albeit that 243 * seems tricky (pre-bin certainly can have values larger than 32bit, 244 * and would need to communicate that fixup value through). 245 * And if we want to support msaa, we'd probably don't want to do the 246 * downscaling in setup in any case... 
247 */ 248 cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) - 249 (int32_t)(c[j] >> (int64_t)FIXED_ORDER)); 250 dcdx <<= 4; 251 dcdy <<= 4; 252 #else 253 const int32_t dcdx = -plane[j].dcdx << 4; 254 const int32_t dcdy = plane[j].dcdy << 4; 255 const int32_t cox = plane[j].eo << 4; 256 const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo; 257 const int32_t cio = (ei << 4) - 1; 258 int32_t co, cdiff; 259 co = c[j] + cox; 260 cdiff = cio - cox; 261 #endif 262 BUILD_MASKS(co, cdiff, 263 dcdx, dcdy, 264 &outmask, /* sign bits from c[i][0..15] + cox */ 265 &partmask); /* sign bits from c[i][0..15] + cio */ 266 } 267 268 j++; 269 } 270 271 if (outmask == 0xffff) 272 return; 273 274 /* Mask of sub-blocks which are inside all trivial accept planes: 275 */ 276 inmask = ~partmask & 0xffff; 277 278 /* Mask of sub-blocks which are inside all trivial reject planes, 279 * but outside at least one trivial accept plane: 280 */ 281 partial_mask = partmask & ~outmask; 282 283 assert((partial_mask & inmask) == 0); 284 285 LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask))); 286 287 /* Iterate over partials: 288 */ 289 while (partial_mask) { 290 int i = ffs(partial_mask) - 1; 291 int ix = (i & 3) * 16; 292 int iy = (i >> 2) * 16; 293 int px = x + ix; 294 int py = y + iy; 295 int64_t cx[NR_PLANES]; 296 297 for (j = 0; j < NR_PLANES; j++) 298 cx[j] = (c[j] 299 - IMUL64(plane[j].dcdx, ix) 300 + IMUL64(plane[j].dcdy, iy)); 301 302 partial_mask &= ~(1 << i); 303 304 LP_COUNT(nr_partially_covered_16); 305 TAG(do_block_16)(task, tri, plane, px, py, cx); 306 } 307 308 /* Iterate over fulls: 309 */ 310 while (inmask) { 311 int i = ffs(inmask) - 1; 312 int ix = (i & 3) * 16; 313 int iy = (i >> 2) * 16; 314 int px = x + ix; 315 int py = y + iy; 316 317 inmask &= ~(1 << i); 318 319 LP_COUNT(nr_fully_covered_16); 320 block_full_16(task, tri, px, py); 321 } 322 } 323 324 #if defined(PIPE_ARCH_SSE) && defined(TRI_16) 325 /* XXX: special 
case this when intersection is not required. 326 * - tile completely within bbox, 327 * - bbox completely within tile. 328 */ 329 void 330 TRI_16(struct lp_rasterizer_task *task, 331 const union lp_rast_cmd_arg arg) 332 { 333 const struct lp_rast_triangle *tri = arg.triangle.tri; 334 const struct lp_rast_plane *plane = GET_PLANES(tri); 335 unsigned mask = arg.triangle.plane_mask; 336 unsigned outmask, partial_mask; 337 unsigned j; 338 __m128i cstep4[NR_PLANES][4]; 339 340 int x = (mask & 0xff); 341 int y = (mask >> 8); 342 343 outmask = 0; /* outside one or more trivial reject planes */ 344 345 x += task->x; 346 y += task->y; 347 348 for (j = 0; j < NR_PLANES; j++) { 349 const int dcdx = -plane[j].dcdx * 4; 350 const int dcdy = plane[j].dcdy * 4; 351 __m128i xdcdy = _mm_set1_epi32(dcdy); 352 353 cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3); 354 cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy); 355 cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy); 356 cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy); 357 358 { 359 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; 360 const int cox = plane[j].eo * 4; 361 362 outmask |= sign_bits4(cstep4[j], c + cox); 363 } 364 } 365 366 if (outmask == 0xffff) 367 return; 368 369 370 /* Mask of sub-blocks which are inside all trivial reject planes, 371 * but outside at least one trivial accept plane: 372 */ 373 partial_mask = 0xffff & ~outmask; 374 375 /* Iterate over partials: 376 */ 377 while (partial_mask) { 378 int i = ffs(partial_mask) - 1; 379 int ix = (i & 3) * 4; 380 int iy = (i >> 2) * 4; 381 int px = x + ix; 382 int py = y + iy; 383 unsigned mask = 0xffff; 384 385 partial_mask &= ~(1 << i); 386 387 for (j = 0; j < NR_PLANES; j++) { 388 const int cx = (plane[j].c - 1 389 - plane[j].dcdx * px 390 + plane[j].dcdy * py) * 4; 391 392 mask &= ~sign_bits4(cstep4[j], cx); 393 } 394 395 if (mask) 396 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask); 397 } 398 } 399 #endif 400 401 #if 
defined(PIPE_ARCH_SSE) && defined(TRI_4) 402 void 403 TRI_4(struct lp_rasterizer_task *task, 404 const union lp_rast_cmd_arg arg) 405 { 406 const struct lp_rast_triangle *tri = arg.triangle.tri; 407 const struct lp_rast_plane *plane = GET_PLANES(tri); 408 unsigned mask = arg.triangle.plane_mask; 409 const int x = task->x + (mask & 0xff); 410 const int y = task->y + (mask >> 8); 411 unsigned j; 412 413 /* Iterate over partials: 414 */ 415 { 416 unsigned mask = 0xffff; 417 418 for (j = 0; j < NR_PLANES; j++) { 419 const int cx = (plane[j].c 420 - plane[j].dcdx * x 421 + plane[j].dcdy * y); 422 423 const int dcdx = -plane[j].dcdx; 424 const int dcdy = plane[j].dcdy; 425 __m128i xdcdy = _mm_set1_epi32(dcdy); 426 427 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3); 428 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 429 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 430 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 431 432 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); 433 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); 434 __m128i result = _mm_packs_epi16(cstep01, cstep23); 435 436 /* Extract the sign bits 437 */ 438 mask &= ~_mm_movemask_epi8(result); 439 } 440 441 if (mask) 442 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); 443 } 444 } 445 #endif 446 447 448 449 #undef TAG 450 #undef TRI_4 451 #undef TRI_16 452 #undef NR_PLANES 453 454