1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29 #include "pipe/p_defines.h" 30 31 #include "util/u_format.h" 32 #include "util/u_memory.h" 33 #include "util/u_string.h" 34 #include "util/u_math.h" 35 36 #include "lp_bld_type.h" 37 #include "lp_bld_const.h" 38 #include "lp_bld_conv.h" 39 #include "lp_bld_swizzle.h" 40 #include "lp_bld_gather.h" 41 #include "lp_bld_debug.h" 42 #include "lp_bld_format.h" 43 #include "lp_bld_arit.h" 44 #include "lp_bld_pack.h" 45 46 47 static void 48 convert_to_soa(struct gallivm_state *gallivm, 49 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], 50 LLVMValueRef dst_soa[4], 51 const struct lp_type soa_type) 52 { 53 unsigned j, k; 54 struct lp_type aos_channel_type = soa_type; 55 56 LLVMValueRef aos_channels[4]; 57 unsigned pixels_per_channel = soa_type.length / 4; 58 59 debug_assert((soa_type.length % 4) == 0); 60 61 aos_channel_type.length >>= 1; 62 63 for (j = 0; j < 4; ++j) { 64 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; 65 66 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); 67 68 for (k = 0; k < pixels_per_channel; ++k) { 69 channel[k] = src_aos[j + 4 * k]; 70 } 71 72 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); 73 } 74 75 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); 76 } 77 78 79 void 80 lp_build_format_swizzle_soa(const struct util_format_description *format_desc, 81 struct lp_build_context *bld, 82 const LLVMValueRef *unswizzled, 83 LLVMValueRef swizzled_out[4]) 84 { 85 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { 86 enum pipe_swizzle swizzle; 87 LLVMValueRef depth_or_stencil; 88 89 if (util_format_has_stencil(format_desc) && 90 !util_format_has_depth(format_desc)) { 91 assert(!bld->type.floating); 92 swizzle = format_desc->swizzle[1]; 93 } 94 else { 95 assert(bld->type.floating); 96 swizzle = format_desc->swizzle[0]; 97 } 98 /* 99 * Return zzz1 or sss1 for depth-stencil formats here. 100 * Correct swizzling will be handled by apply_sampler_swizzle() later. 101 */ 102 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); 103 104 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil; 105 swizzled_out[3] = bld->one; 106 } 107 else { 108 unsigned chan; 109 for (chan = 0; chan < 4; ++chan) { 110 enum pipe_swizzle swizzle = format_desc->swizzle[chan]; 111 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); 112 } 113 } 114 } 115 116 117 118 static LLVMValueRef 119 lp_build_extract_soa_chan(struct lp_build_context *bld, 120 unsigned blockbits, 121 boolean srgb_chan, 122 struct util_format_channel_description chan_desc, 123 LLVMValueRef packed) 124 { 125 struct gallivm_state *gallivm = bld->gallivm; 126 LLVMBuilderRef builder = gallivm->builder; 127 struct lp_type type = bld->type; 128 LLVMValueRef input = packed; 129 const unsigned width = chan_desc.size; 130 const unsigned start = chan_desc.shift; 131 const unsigned stop = start + width; 132 133 /* Decode the input vector component */ 134 135 switch(chan_desc.type) { 136 case UTIL_FORMAT_TYPE_VOID: 137 input = bld->undef; 138 break; 139 140 case UTIL_FORMAT_TYPE_UNSIGNED: 141 /* 142 * Align the LSB 143 */ 144 if (start) { 145 input = LLVMBuildLShr(builder, input, 146 lp_build_const_int_vec(gallivm, type, start), ""); 147 } 148 149 /* 150 * Zero the MSBs 151 */ 152 if (stop < blockbits) { 153 unsigned mask = ((unsigned long long)1 << width) - 1; 154 input = LLVMBuildAnd(builder, input, 155 lp_build_const_int_vec(gallivm, type, mask), ""); 156 } 157 158 /* 159 * Type conversion 160 */ 161 if (type.floating) { 162 if (srgb_chan) { 163 struct lp_type conv_type = lp_uint_type(type); 164 input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); 165 } 166 else { 167 if(chan_desc.normalized) 168 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); 169 else 170 input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); 171 } 172 } 173 else if (chan_desc.pure_integer) { 174 /* Nothing to do */ 175 } else { 176 /* FIXME */ 177 assert(0); 178 } 179 break; 180 181 case UTIL_FORMAT_TYPE_SIGNED: 182 /* 183 * Align the sign bit first. 184 */ 185 if (stop < type.width) { 186 unsigned bits = type.width - stop; 187 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); 188 input = LLVMBuildShl(builder, input, bits_val, ""); 189 } 190 191 /* 192 * Align the LSB (with an arithmetic shift to preserve the sign) 193 */ 194 if (chan_desc.size < type.width) { 195 unsigned bits = type.width - chan_desc.size; 196 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); 197 input = LLVMBuildAShr(builder, input, bits_val, ""); 198 } 199 200 /* 201 * Type conversion 202 */ 203 if (type.floating) { 204 input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); 205 if (chan_desc.normalized) { 206 double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); 207 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); 208 input = LLVMBuildFMul(builder, input, scale_val, ""); 209 /* 210 * The formula above will produce value below -1.0 for most negative 211 * value but everything seems happy with that hence disable for now. 212 */ 213 if (0) 214 input = lp_build_max(bld, input, 215 lp_build_const_vec(gallivm, type, -1.0f)); 216 } 217 } 218 else if (chan_desc.pure_integer) { 219 /* Nothing to do */ 220 } else { 221 /* FIXME */ 222 assert(0); 223 } 224 break; 225 226 case UTIL_FORMAT_TYPE_FLOAT: 227 if (type.floating) { 228 if (chan_desc.size == 16) { 229 struct lp_type f16i_type = type; 230 f16i_type.width /= 2; 231 f16i_type.floating = 0; 232 if (start) { 233 input = LLVMBuildLShr(builder, input, 234 lp_build_const_int_vec(gallivm, type, start), ""); 235 } 236 input = LLVMBuildTrunc(builder, input, 237 lp_build_vec_type(gallivm, f16i_type), ""); 238 input = lp_build_half_to_float(gallivm, input); 239 } else { 240 assert(start == 0); 241 assert(stop == 32); 242 assert(type.width == 32); 243 } 244 input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); 245 } 246 else { 247 /* FIXME */ 248 assert(0); 249 input = bld->undef; 250 } 251 break; 252 253 case UTIL_FORMAT_TYPE_FIXED: 254 if (type.floating) { 255 double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); 256 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); 257 input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); 258 input = LLVMBuildFMul(builder, input, scale_val, ""); 259 } 260 else { 261 /* FIXME */ 262 assert(0); 263 input = bld->undef; 264 } 265 break; 266 267 default: 268 assert(0); 269 input = bld->undef; 270 break; 271 } 272 273 return input; 274 } 275 276 277 /** 278 * Unpack several pixels in SoA. 279 * 280 * It takes a vector of packed pixels: 281 * 282 * packed = {P0, P1, P2, P3, ..., Pn} 283 * 284 * And will produce four vectors: 285 * 286 * red = {R0, R1, R2, R3, ..., Rn} 287 * green = {G0, G1, G2, G3, ..., Gn} 288 * blue = {B0, B1, B2, B3, ..., Bn} 289 * alpha = {A0, A1, A2, A3, ..., An} 290 * 291 * It requires that a packed pixel fits into an element of the output 292 * channels. The common case is when converting pixel with a depth of 32 bit or 293 * less into floats. 294 * 295 * \param format_desc the format of the 'packed' incoming pixel vector 296 * \param type the desired type for rgba_out (type.length = n, above) 297 * \param packed the incoming vector of packed pixels 298 * \param rgba_out returns the SoA R,G,B,A vectors 299 */ 300 void 301 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, 302 const struct util_format_description *format_desc, 303 struct lp_type type, 304 LLVMValueRef packed, 305 LLVMValueRef rgba_out[4]) 306 { 307 struct lp_build_context bld; 308 LLVMValueRef inputs[4]; 309 unsigned chan; 310 311 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); 312 assert(format_desc->block.width == 1); 313 assert(format_desc->block.height == 1); 314 assert(format_desc->block.bits <= type.width); 315 /* FIXME: Support more output types */ 316 assert(type.width == 32); 317 318 lp_build_context_init(&bld, gallivm, type); 319 320 /* Decode the input vector components */ 321 for (chan = 0; chan < format_desc->nr_channels; ++chan) { 322 struct util_format_channel_description chan_desc = format_desc->channel[chan]; 323 boolean srgb_chan = FALSE; 324 325 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && 326 format_desc->swizzle[3] != chan) { 327 srgb_chan = TRUE; 328 } 329 330 inputs[chan] = lp_build_extract_soa_chan(&bld, 331 format_desc->block.bits, 332 srgb_chan, 333 chan_desc, 334 packed); 335 } 336 337 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); 338 } 339 340 341 /** 342 * Convert a vector of rgba8 values into 32bit wide SoA vectors. 343 * 344 * \param dst_type The desired return type. For pure integer formats 345 * this should be a 32bit wide int or uint vector type, 346 * otherwise a float vector type. 347 * 348 * \param packed The rgba8 values to pack. 349 * 350 * \param rgba The 4 SoA return vectors. 351 */ 352 void 353 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, 354 struct lp_type dst_type, 355 LLVMValueRef packed, 356 LLVMValueRef *rgba) 357 { 358 LLVMBuilderRef builder = gallivm->builder; 359 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff); 360 unsigned chan; 361 362 /* XXX technically shouldn't use that for uint dst_type */ 363 packed = LLVMBuildBitCast(builder, packed, 364 lp_build_int_vec_type(gallivm, dst_type), ""); 365 366 /* Decode the input vector components */ 367 for (chan = 0; chan < 4; ++chan) { 368 #ifdef PIPE_ARCH_LITTLE_ENDIAN 369 unsigned start = chan*8; 370 #else 371 unsigned start = (3-chan)*8; 372 #endif 373 unsigned stop = start + 8; 374 LLVMValueRef input; 375 376 input = packed; 377 378 if (start) 379 input = LLVMBuildLShr(builder, input, 380 lp_build_const_int_vec(gallivm, dst_type, start), ""); 381 382 if (stop < 32) 383 input = LLVMBuildAnd(builder, input, mask, ""); 384 385 if (dst_type.floating) 386 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input); 387 388 rgba[chan] = input; 389 } 390 } 391 392 393 394 /** 395 * Fetch a texels from a texture, returning them in SoA layout. 396 * 397 * \param type the desired return type for 'rgba'. The vector length 398 * is the number of texels to fetch 399 * \param aligned if the offset is guaranteed to be aligned to element width 400 * 401 * \param base_ptr points to the base of the texture mip tree. 402 * \param offset offset to start of the texture image block. For non- 403 * compressed formats, this simply is an offset to the texel. 404 * For compressed formats, it is an offset to the start of the 405 * compressed data block. 406 * 407 * \param i, j the sub-block pixel coordinates. For non-compressed formats 408 * these will always be (0,0). For compressed formats, i will 409 * be in [0, block_width-1] and j will be in [0, block_height-1]. 410 * \param cache optional value pointing to a lp_build_format_cache structure 411 */ 412 void 413 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, 414 const struct util_format_description *format_desc, 415 struct lp_type type, 416 boolean aligned, 417 LLVMValueRef base_ptr, 418 LLVMValueRef offset, 419 LLVMValueRef i, 420 LLVMValueRef j, 421 LLVMValueRef cache, 422 LLVMValueRef rgba_out[4]) 423 { 424 LLVMBuilderRef builder = gallivm->builder; 425 enum pipe_format format = format_desc->format; 426 struct lp_type fetch_type; 427 428 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && 429 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || 430 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || 431 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && 432 format_desc->block.width == 1 && 433 format_desc->block.height == 1 && 434 format_desc->block.bits <= type.width && 435 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || 436 format_desc->channel[0].size == 32 || 437 format_desc->channel[0].size == 16)) 438 { 439 /* 440 * The packed pixel fits into an element of the destination format. Put 441 * the packed pixels into a vector and extract each component for all 442 * vector elements in parallel. 443 */ 444 445 LLVMValueRef packed; 446 447 /* 448 * gather the texels from the texture 449 * Ex: packed = {XYZW, XYZW, XYZW, XYZW} 450 */ 451 assert(format_desc->block.bits <= type.width); 452 fetch_type = lp_type_uint(type.width); 453 packed = lp_build_gather(gallivm, 454 type.length, 455 format_desc->block.bits, 456 fetch_type, 457 aligned, 458 base_ptr, offset, FALSE); 459 460 /* 461 * convert texels to float rgba 462 */ 463 lp_build_unpack_rgba_soa(gallivm, 464 format_desc, 465 type, 466 packed, rgba_out); 467 return; 468 } 469 470 471 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && 472 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && 473 format_desc->block.width == 1 && 474 format_desc->block.height == 1 && 475 format_desc->block.bits > type.width && 476 ((format_desc->block.bits <= type.width * type.length && 477 format_desc->channel[0].size <= type.width) || 478 (format_desc->channel[0].size == 64 && 479 format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && 480 type.floating))) 481 { 482 /* 483 * Similar to above, but the packed pixel is larger than what fits 484 * into an element of the destination format. The packed pixels will be 485 * shuffled into SoA vectors appropriately, and then the extraction will 486 * be done in parallel as much as possible. 487 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so 488 * the gathered vectors can be shuffled easily (even with avx). 489 * 64xn float -> 32xn float is handled too but it's a bit special as 490 * it does the conversion pre-shuffle. 491 */ 492 493 LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; 494 struct lp_type fetch_type, gather_type = type; 495 unsigned num_gather, fetch_width, i, j; 496 struct lp_build_context bld; 497 boolean fp64 = format_desc->channel[0].size == 64; 498 499 lp_build_context_init(&bld, gallivm, type); 500 501 assert(type.width == 32); 502 assert(format_desc->block.bits > type.width); 503 504 /* 505 * First, figure out fetch order. 506 */ 507 fetch_width = util_next_power_of_two(format_desc->block.bits); 508 /* 509 * fp64 are treated like fp32 except we fetch twice wide values 510 * (as we shuffle after trunc). The shuffles for that work out 511 * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) 512 * albeit we miss the potential opportunity for hw gather (as it 513 * only handles native size). 514 */ 515 num_gather = fetch_width / type.width; 516 gather_type.width *= num_gather; 517 if (fp64) { 518 num_gather /= 2; 519 } 520 gather_type.length /= num_gather; 521 522 for (i = 0; i < num_gather; i++) { 523 LLVMValueRef offsetr, shuf_vec; 524 if(num_gather == 4) { 525 for (j = 0; j < gather_type.length; j++) { 526 unsigned idx = i + 4*j; 527 shuffles[j] = lp_build_const_int32(gallivm, idx); 528 } 529 shuf_vec = LLVMConstVector(shuffles, gather_type.length); 530 offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); 531 532 } 533 else if (num_gather == 2) { 534 assert(num_gather == 2); 535 for (j = 0; j < gather_type.length; j++) { 536 unsigned idx = i*2 + (j%2) + (j/2)*4; 537 shuffles[j] = lp_build_const_int32(gallivm, idx); 538 } 539 shuf_vec = LLVMConstVector(shuffles, gather_type.length); 540 offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); 541 } 542 else { 543 assert(num_gather == 1); 544 offsetr = offset; 545 } 546 if (gather_type.length == 1) { 547 LLVMValueRef zero = lp_build_const_int32(gallivm, 0); 548 offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); 549 } 550 551 /* 552 * Determine whether to use float or int loads. This is mostly 553 * to outsmart the (stupid) llvm int/float shuffle logic, we 554 * don't really care much if the data is floats or ints... 555 * But llvm will refuse to use single float shuffle with int data 556 * and instead use 3 int shuffles instead, the code looks atrocious. 557 * (Note bitcasts often won't help, as llvm is too smart to be 558 * fooled by that.) 559 * Nobody cares about simd float<->int domain transition penalties, 560 * which usually don't even exist for shuffles anyway. 561 * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is 562 * going into transpose, which is unpacks, so doesn't really matter 563 * much). 564 * With 2x32bit or 4x16bit fetch, we use float vec, since those 565 * go into the weird channel separation shuffle. With floats, 566 * this is (with 128bit vectors): 567 * - 2 movq, 2 movhpd, 2 shufps 568 * With ints it would be: 569 * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw 570 * I've seen texture functions increase in code size by 15% just due 571 * to that (there's lots of such fetches in them...) 572 * (We could chose a different gather order to improve this somewhat 573 * for the int path, but it would basically just drop the blends, 574 * so the float path with this order really is optimal.) 575 * Albeit it is tricky sometimes llvm doesn't ignore the float->int 576 * casts so must avoid them until we're done with the float shuffle... 577 * 3x16bit formats (the same is also true for 3x8) are pretty bad but 578 * there's nothing we can do about them (we could overallocate by 579 * those couple bytes and use unaligned but pot sized load). 580 * Note that this is very much x86 specific. I don't know if this 581 * affect other archs at all. 582 */ 583 if (num_gather > 1) { 584 /* 585 * We always want some float type here (with x86) 586 * due to shuffles being float ones afterwards (albeit for 587 * the num_gather == 4 case int should work fine too 588 * (unless there's some problems with avx but not avx2). 589 */ 590 if (format_desc->channel[0].size == 64) { 591 fetch_type = lp_type_float_vec(64, gather_type.width); 592 } else { 593 fetch_type = lp_type_int_vec(32, gather_type.width); 594 } 595 } 596 else { 597 /* type doesn't matter much */ 598 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && 599 (format_desc->channel[0].size == 32 || 600 format_desc->channel[0].size == 64)) { 601 fetch_type = lp_type_float(gather_type.width); 602 } else { 603 fetch_type = lp_type_uint(gather_type.width); 604 } 605 } 606 607 /* Now finally gather the values */ 608 packed[i] = lp_build_gather(gallivm, gather_type.length, 609 format_desc->block.bits, 610 fetch_type, aligned, 611 base_ptr, offsetr, FALSE); 612 if (fp64) { 613 struct lp_type conv_type = type; 614 conv_type.width *= 2; 615 packed[i] = LLVMBuildBitCast(builder, packed[i], 616 lp_build_vec_type(gallivm, conv_type), ""); 617 packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); 618 } 619 } 620 621 /* shuffle the gathered values to SoA */ 622 if (num_gather == 2) { 623 for (i = 0; i < num_gather; i++) { 624 for (j = 0; j < type.length; j++) { 625 unsigned idx = (j%2)*2 + (j/4)*4 + i; 626 if ((j/2)%2) 627 idx += type.length; 628 shuffles[j] = lp_build_const_int32(gallivm, idx); 629 } 630 dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], 631 LLVMConstVector(shuffles, type.length), ""); 632 } 633 } 634 else if (num_gather == 4) { 635 lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); 636 } 637 else { 638 assert(num_gather == 1); 639 dst[0] = packed[0]; 640 } 641 642 /* 643 * And finally unpack exactly as above, except that 644 * chan shift is adjusted and the right vector selected. 645 */ 646 if (!fp64) { 647 for (i = 0; i < num_gather; i++) { 648 dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); 649 } 650 for (i = 0; i < format_desc->nr_channels; i++) { 651 struct util_format_channel_description chan_desc = format_desc->channel[i]; 652 unsigned blockbits = type.width; 653 unsigned vec_nr; 654 655 #ifdef PIPE_ARCH_BIG_ENDIAN 656 vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; 657 #else 658 vec_nr = chan_desc.shift / type.width; 659 #endif 660 chan_desc.shift %= type.width; 661 662 output[i] = lp_build_extract_soa_chan(&bld, 663 blockbits, 664 FALSE, 665 chan_desc, 666 dst[vec_nr]); 667 } 668 } 669 else { 670 for (i = 0; i < format_desc->nr_channels; i++) { 671 output[i] = dst[i]; 672 } 673 } 674 675 lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); 676 return; 677 } 678 679 if (format == PIPE_FORMAT_R11G11B10_FLOAT || 680 format == PIPE_FORMAT_R9G9B9E5_FLOAT) { 681 /* 682 * similar conceptually to above but requiring special 683 * AoS packed -> SoA float conversion code. 684 */ 685 LLVMValueRef packed; 686 struct lp_type fetch_type = lp_type_uint(type.width); 687 688 assert(type.floating); 689 assert(type.width == 32); 690 691 packed = lp_build_gather(gallivm, type.length, 692 format_desc->block.bits, 693 fetch_type, aligned, 694 base_ptr, offset, FALSE); 695 if (format == PIPE_FORMAT_R11G11B10_FLOAT) { 696 lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); 697 } 698 else { 699 lp_build_rgb9e5_to_float(gallivm, packed, rgba_out); 700 } 701 return; 702 } 703 704 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && 705 format_desc->block.bits == 64) { 706 /* 707 * special case the format is 64 bits but we only require 708 * 32bit (or 8bit) from each block. 709 */ 710 LLVMValueRef packed; 711 struct lp_type fetch_type = lp_type_uint(type.width); 712 713 if (format == PIPE_FORMAT_X32_S8X24_UINT) { 714 /* 715 * for stencil simply fix up offsets - could in fact change 716 * base_ptr instead even outside the shader. 717 */ 718 unsigned mask = (1 << 8) - 1; 719 LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); 720 offset = LLVMBuildAdd(builder, offset, s_offset, ""); 721 packed = lp_build_gather(gallivm, type.length, 32, fetch_type, 722 aligned, base_ptr, offset, FALSE); 723 packed = LLVMBuildAnd(builder, packed, 724 lp_build_const_int_vec(gallivm, type, mask), ""); 725 } 726 else { 727 assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); 728 packed = lp_build_gather(gallivm, type.length, 32, fetch_type, 729 aligned, base_ptr, offset, TRUE); 730 packed = LLVMBuildBitCast(builder, packed, 731 lp_build_vec_type(gallivm, type), ""); 732 } 733 /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */ 734 rgba_out[0] = rgba_out[1] = rgba_out[2] = packed; 735 rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f); 736 return; 737 } 738 739 /* 740 * Try calling lp_build_fetch_rgba_aos for all pixels. 741 * Should only really hit subsampled, compressed 742 * (for s3tc srgb too, for rgtc the unorm ones only) by now. 743 * (This is invalid for plain 8unorm formats because we're lazy with 744 * the swizzle since some results would arrive swizzled, some not.) 745 */ 746 747 if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && 748 (util_format_fits_8unorm(format_desc) || 749 format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && 750 type.floating && type.width == 32 && 751 (type.length == 1 || (type.length % 4 == 0))) { 752 struct lp_type tmp_type; 753 struct lp_build_context bld; 754 LLVMValueRef packed, rgba[4]; 755 const struct util_format_description *flinear_desc; 756 const struct util_format_description *frgba8_desc; 757 unsigned chan; 758 759 lp_build_context_init(&bld, gallivm, type); 760 761 /* 762 * Make sure the conversion in aos really only does convert to rgba8 763 * and not anything more (so use linear format, adjust type). 764 */ 765 flinear_desc = util_format_description(util_format_linear(format)); 766 memset(&tmp_type, 0, sizeof tmp_type); 767 tmp_type.width = 8; 768 tmp_type.length = type.length * 4; 769 tmp_type.norm = TRUE; 770 771 packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, 772 aligned, base_ptr, offset, i, j, cache); 773 packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); 774 775 /* 776 * The values are now packed so they match ordinary (srgb) RGBA8 format, 777 * hence need to use matching format for unpack. 778 */ 779 frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); 780 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { 781 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); 782 frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); 783 } 784 lp_build_unpack_rgba_soa(gallivm, 785 frgba8_desc, 786 type, 787 packed, rgba); 788 789 /* 790 * We converted 4 channels. Make sure llvm can drop unneeded ones 791 * (luckily the rgba order is fixed, only LA needs special case). 792 */ 793 for (chan = 0; chan < 4; chan++) { 794 enum pipe_swizzle swizzle = format_desc->swizzle[chan]; 795 if (chan == 3 && util_format_is_luminance_alpha(format)) { 796 swizzle = PIPE_SWIZZLE_W; 797 } 798 rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); 799 } 800 return; 801 } 802 803 804 /* 805 * Fallback to calling lp_build_fetch_rgba_aos for each pixel. 806 * 807 * This is not the most efficient way of fetching pixels, as we 808 * miss some opportunities to do vectorization, but this is 809 * convenient for formats or scenarios for which there was no 810 * opportunity or incentive to optimize. 811 * 812 * We do NOT want to end up here, this typically is quite terrible, 813 * in particular if the formats have less than 4 channels. 814 * 815 * Right now, this should only be hit for: 816 * - RGTC snorm formats 817 * (those miss fast fetch functions hence they are terrible anyway) 818 */ 819 820 { 821 unsigned k; 822 struct lp_type tmp_type; 823 LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; 824 825 if (gallivm_debug & GALLIVM_DEBUG_PERF) { 826 debug_printf("%s: AoS fetch fallback for %s\n", 827 __FUNCTION__, format_desc->short_name); 828 } 829 830 tmp_type = type; 831 tmp_type.length = 4; 832 833 /* 834 * Note that vector transpose can be worse compared to insert/extract 835 * for aos->soa conversion (for formats with 1 or 2 channels). However, 836 * we should try to avoid getting here for just about all formats, so 837 * don't bother. 838 */ 839 840 /* loop over number of pixels */ 841 for(k = 0; k < type.length; ++k) { 842 LLVMValueRef index = lp_build_const_int32(gallivm, k); 843 LLVMValueRef offset_elem; 844 LLVMValueRef i_elem, j_elem; 845 846 offset_elem = LLVMBuildExtractElement(builder, offset, 847 index, ""); 848 849 i_elem = LLVMBuildExtractElement(builder, i, index, ""); 850 j_elem = LLVMBuildExtractElement(builder, j, index, ""); 851 852 /* Get a single float[4]={R,G,B,A} pixel */ 853 aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, 854 aligned, base_ptr, offset_elem, 855 i_elem, j_elem, cache); 856 857 } 858 convert_to_soa(gallivm, aos_fetch, rgba_out, type); 859 } 860 } 861