/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results then one should implement it
 * as:
 *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}
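
/*
 * For reference, with n = 4 the masks built above are:
 *   lo_hi = 0:  <0, 4, 1, 5>   (punpckl* pattern)
 *   lo_hi = 1:  <2, 6, 3, 7>   (punpckh* pattern)
 */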

/**
 * Similar to lp_build_const_unpack_shuffle but for the special AVX 256-bit unpack.
 * See the comment above lp_build_interleave2_half for more details.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Build shuffle vectors that match PACKxx instructions.
 */
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
      elems[i] = lp_build_const_int32(gallivm, 2*i);

   return LLVMConstVector(elems, n);
}

/**
 * Return a vector with elements src[start:start+size].
 * Most useful for getting half the values out of a 256-bit sized vector,
 * otherwise it may cause data rearrangement to happen.
 */
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= Elements(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}

/**
 * Concatenate several vectors (of the same type) into a larger one.
 * The number of vectors must be a power of two.
 * Most useful for building up a 256-bit sized vector out of two 128-bit ones.
 */
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= Elements(shuffles));
   assert(util_is_power_of_two(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}

/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
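
/*
 * For example, with a 4-element vector type and inputs
 *   a = a0 a1 a2 a3,  b = b0 b1 b2 b3
 * lo_hi = 0 yields a0 b0 a1 b1 and lo_hi = 1 yields a2 b2 a3 b3.
 */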

/**
 * Interleave vector elements, but for 256-bit vectors treat the operands as
 * two concatenated 128-bit vectors and interleave within each half.
 *
 * This differs from lp_build_interleave2, which would produce the following
 * (for lo): a0 b0 a1 b1 a2 b2 a3 b3, a pattern that does not compile into an
 * AVX unpack instruction.
 *
 * An example: interleaving 8x float with 8x float using the AVX 256-bit unpack:
 *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * is equivalent to interleaving 2x 128-bit vectors
 *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo would result in:
 *   a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}

/**
 * Double the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
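
/*
 * For example, unpacking an unsigned <8 x i16> into two <4 x i32> (on a
 * little-endian target): dst_lo ends up holding the zero-extended elements
 * 0..3 and dst_hi the zero-extended elements 4..7; for signed types the
 * arithmetic shift above supplies the replicated sign bits instead of zeros.
 */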

/**
 * Expand the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 */
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels, only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}


/**
 * Non-interleaved pack.
 *
 * This will move values as
 *         (LSB)                     (MSB)
 *   lo =  l0 __ l1 __ l2 __ .. __ ln __
 *   hi =  h0 __ h1 __ h2 __ .. __ hn __
 *   res = l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

#if HAVE_LLVM < 0x0207
   intr_type = src_type;
#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;

      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
               /* llvm < 2.7 has inconsistent signatures except for packusdw */
               intr_type = dst_type;
#endif
            }
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packsswb.128";
         }
         else {
            intrinsic = "llvm.x86.sse2.packuswb.128";
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + nlen, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
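            /*
             * Same split-and-pack for the hi input; its results land after
             * lo's in tmpres, so the concat below preserves the documented
             * (lo..., hi...) element order.
             */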
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + nlen, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}



/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}
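
/*
 * Clamp example: packing 32-bit into unsigned 16-bit values gives
 * dst_bits = 16 and dst_max = 0xffff; packing into signed 16-bit gives
 * dst_bits = 15 and dst_max = 0x7fff. Only the upper bound is clamped here
 * (see the FIXME above); in the unclamped cases the saturating SSE pack
 * instructions handle both bounds.
 */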

/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels, only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Only take the sign change into consideration in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
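
/*
 * For example, packing four <4 x i32> sources into a single <16 x i8>
 * destination takes two passes: the first produces two <8 x i16> vectors,
 * the second a single <16 x i8>. The destination sign flag only takes
 * effect on that final pass.
 */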

/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels, only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack,
             * then expand, simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      tmp[0] = src[0];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}


/**
 * Expand the src vector from src_type.length to dst_length elements.
 * The additional elements are undefined.
 */
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    struct lp_type src_type,
                    unsigned dst_length)
{
   LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(dst_length <= Elements(elems));
   assert(dst_length >= src_type.length);

   if (src_type.length == dst_length)
      return src;

   /* If it's a single scalar type, no need to reinvent the wheel */
   if (src_type.length == 1) {
      return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
   }

   /* All elements from src vector */
   for (i = 0; i < src_type.length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Undef fill remaining space */
   for (i = src_type.length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_type.length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
}
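
/*
 * For example, padding a <2 x i32> vector to length 4 uses the shuffle mask
 * <0, 1, 2, 2>: indices below the source length select the source elements,
 * while the remaining indices select lanes of the undef operand.
 */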