/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher precision for
 * the intermediate values. For example, if one needs the function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}
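
/*
 * For reference, with n = 4 the masks built above are:
 *
 *    lo_hi = 0:  <0, 4, 1, 5>   (PUNPCKLxx order)
 *    lo_hi = 1:  <2, 6, 3, 7>   (PUNPCKHxx order)
 */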

/**
 * Similar to lp_build_const_unpack_shuffle but for the special AVX 256-bit
 * unpack. See the comment above lp_build_interleave2_half for more details.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Build shuffle vectors that match PACKxx (SSE) instructions or
 * VPERM (Altivec).
 */
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      elems[i] = lp_build_const_int32(gallivm, 2*i);
#else
      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
#endif

   return LLVMConstVector(elems, n);
}

/**
 * Return a vector with the elements src[start:start+size].
 * Most useful for getting half the values out of a 256-bit vector;
 * otherwise it may cause data rearrangement.
 */
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= ARRAY_SIZE(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}

/**
 * Concatenate several vectors (all of the same type) into a larger one.
 * The number of vectors must be a power of two.
 * Most useful for building up a 256-bit vector out of two 128-bit ones.
 */
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
   assert(util_is_power_of_two(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}
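
/*
 * A minimal usage sketch of how the two helpers above pair up, e.g. to run
 * some 128-bit-wide computation on a 256-bit 8x32 vector (process128 is an
 * illustrative placeholder, not a real function):
 *
 *    LLVMValueRef halves[2], res;
 *    halves[0] = lp_build_extract_range(gallivm, src, 0, 4);
 *    halves[1] = lp_build_extract_range(gallivm, src, 4, 4);
 *    halves[0] = process128(halves[0]);
 *    halves[1] = process128(halves[1]);
 *    res = lp_build_concat(gallivm, halves, type128, 2);
 *
 * where type128 describes the 4x32 half vectors. With these start/size
 * values the extracts are pure splits, so no data rearrangement happens.
 */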

/**
 * Combine vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated into a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * E.g. for num_srcs = 4 and src = [x, y, z, w]:
 *    num_dsts = 1  =>  dst = [xyzw]    return = 4
 *    num_dsts = 2  =>  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   unsigned i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}


/**
 * Un-interleave vector.
 * This will return a vector consisting of every second element
 * (beginning at 0 or 1, depending on lo_hi).
 * The returned vector will have half as many elements (and hence half the
 * total bit width) as the source vector.
 */
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
                       unsigned num_elems,
                       LLVMValueRef a,
                       unsigned lo_hi)
{
   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   assert(num_elems <= LP_MAX_VECTOR_LENGTH);

   for (i = 0; i < num_elems / 2; ++i)
      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);

   shuffle = LLVMConstVector(elems, num_elems / 2);

   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
}


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256-bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for an llvm code generation deficiency.
       * Strangely enough, while this needs vinsertf128/vextractf128
       * instructions (hence a natural match when using 2x128bit vectors),
       * the "normal" unpack shuffle generates code ranging from atrocious
       * (llvm 3.1) to terrible (llvm 3.2, 3.3). So use some different
       * shuffles instead (the exact shuffles don't seem to matter, as long
       * as they avoid 128-bit wide elements; 8x32 or 4x64 both work).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
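
/*
 * For example, with type.length = 4:
 *
 *    a = a0 a1 a2 a3
 *    b = b0 b1 b2 b3
 *
 *    lo_hi = 0:  a0 b0 a1 b1
 *    lo_hi = 1:  a2 b2 a3 b3
 */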

/**
 * Interleave vector elements, but for 256-bit vectors:
 * treat them as an interleave of two concatenated 128-bit vectors.
 *
 * This differs from lp_build_interleave2, which (for lo) would produce
 * a0 b0 a1 b1 a2 b2 a3 b3; that order does not compile into an AVX unpack
 * instruction.
 *
 * An example: interleaving 8x float with 8x float using the AVX 256-bit
 * unpack:
 *    a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * This is equivalent to interleaving 2x 128-bit vectors:
 *    a0 a1 a2 a3 <-> b0 b1 b2 b3, concatenated with
 *    a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo results in:
 *    a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi results in:
 *    a2 b2 a3 b3 a6 b6 a7 b7
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}


/**
 * Double the bit width.
 *
 * This only changes the number of bits used to represent the values, not
 * the values themselves.
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);

#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
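
/*
 * For example, unpacking an unsigned 8x16 vector into two 4x32 vectors
 * (little endian; msb is all zeros since the types are unsigned):
 *
 *    src    = s0 s1 s2 s3 s4 s5 s6 s7
 *    dst_lo = s0 s1 s2 s3   (each value zero-extended to 32 bits)
 *    dst_hi = s4 s5 s6 s7
 */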

/**
 * Double the bit width, with an ordering that fits the CPU nicely.
 *
 * This only changes the number of bits used to represent the values, not
 * the values themselves.
 *
 * The order of the results is not guaranteed, other than that it will
 * match the corresponding lp_build_pack2_native call.
 */
void
lp_build_unpack2_native(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef src,
                        LLVMValueRef *dst_lo,
                        LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
                          lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
   } else {
      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
   }
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Expand the bit width.
 *
 * This only changes the number of bits used to represent the values, not
 * the values themselves.
 */
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
                          &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}
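
/*
 * E.g. widening an unsigned 16x8 vector to 32-bit elements takes two
 * doubling steps and yields four 4x32 destination vectors:
 *
 *    16x8  ->  2x 8x16  ->  4x 4x32
 *
 * so the caller would pass num_dsts = 4; the intermediate results are
 * stored in place in the dst array.
 */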

/**
 * Non-interleaved pack.
 *
 * This will move values as:
 *          (LSB)                    (MSB)
 *    lo =  l0 __ l1 __ l2 __ .. __ ln __
 *    hi =  h0 __ h1 __ h2 __ .. __ hn __
 *    res = l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This only changes the number of bits used to represent the values, not
 * the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
       src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;
      boolean swap_intrinsic_operands = FALSE;

      switch(src_type.width) {
      case 32:
         if (util_cpu_caps.has_sse2) {
            if (dst_type.sign) {
               intrinsic = "llvm.x86.sse2.packssdw.128";
            } else {
               if (util_cpu_caps.has_sse4_1) {
                  intrinsic = "llvm.x86.sse41.packusdw";
               }
            }
         } else if (util_cpu_caps.has_altivec) {
            if (dst_type.sign) {
               intrinsic = "llvm.ppc.altivec.vpkswss";
            } else {
               intrinsic = "llvm.ppc.altivec.vpkuwus";
            }
#ifdef PIPE_ARCH_LITTLE_ENDIAN
            swap_intrinsic_operands = TRUE;
#endif
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshss";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         } else {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshus";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            if (swap_intrinsic_operands) {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
            } else {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            }
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            int lo_off = swap_intrinsic_operands ? nlen : 0;
            int hi_off = swap_intrinsic_operands ? 0 : nlen;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + hi_off, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + hi_off, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
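
/*
 * E.g. on SSE2, packing two signed 4x32 vectors into one signed 8x16
 * vector takes the single-intrinsic path above (PACKSSDW):
 *
 *    lo  = l0 l1 l2 l3
 *    hi  = h0 h1 h2 h3
 *    res = l0 l1 l2 l3 h0 h1 h2 h3
 *
 * The instruction saturates, but since the values are assumed to be
 * pre-clamped to [-32768, 32767] the saturation never fires here.
 */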

/**
 * Non-interleaved native pack.
 *
 * Similar to lp_build_pack2, but the ordering of values is not guaranteed,
 * other than that it will match lp_build_unpack2_native.
 *
 * In particular, with avx2, the lower and upper 128 bits of the vectors
 * will be packed independently, so that (with 32bit->16bit values):
 *          (LSB)                                            (MSB)
 *    lo =  l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
 *    hi =  h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
 *    res = l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
 *
 * This only changes the number of bits used to represent the values, not
 * the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results.
 */
LLVMValueRef
lp_build_pack2_native(struct gallivm_state *gallivm,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef lo,
                      LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type intr_type = dst_type;
   const char *intrinsic = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* At this point we only have a special case for avx2 */
   if (src_type.length * src_type.width == 256 &&
       util_cpu_caps.has_avx2) {
      switch(src_type.width) {
      case 32:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packssdw";
         } else {
            intrinsic = "llvm.x86.avx2.packusdw";
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packsswb";
         } else {
            intrinsic = "llvm.x86.avx2.packuswb";
         }
         break;
      }
   }
   if (intrinsic) {
      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
                                       lo, hi);
   }
   else {
      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
   }
}
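
/*
 * Since only the pairing with lp_build_unpack2_native is guaranteed, a
 * typical round trip keeps both "native" calls together (t16/t32 are
 * illustrative names for the narrow and wide lp_type pair):
 *
 *    lp_build_unpack2_native(gallivm, t16, t32, a, &al, &ah);
 *    ... compute cl/ch from al/ah with 32-bit precision ...
 *    res = lp_build_pack2_native(gallivm, t32, t16, cl, ch);
 *
 * Mixing a native unpack with the order-preserving lp_build_pack2 would
 * scramble the element order on avx2.
 */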

/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
                                ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}
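
/*
 * E.g. for an unsigned 32 -> 16 pack on SSE2 without SSE4.1 there is no
 * matching pack intrinsic, so lp_build_pack2 falls back to the truncating
 * shuffle; the min() against dst_max = 0xffff above is then what makes
 * out-of-range values saturate instead of wrap.
 */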

/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Only take the sign change into consideration in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
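
/*
 * E.g. truncating four 4x32 vectors to a single 16x8 vector takes two
 * halving steps:
 *
 *    4x 4x32  ->  2x 8x16  ->  1x 16x8
 *
 * with the destination sign flag only applied in the final step, as noted
 * in the loop above.
 */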

/**
 * Truncate or expand the bit width.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      /* Conversion must be M:1 */
      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      /* Conversion must be 1:N */
      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      /* "Conversion" must be N:N */
      assert(num_srcs == num_dsts);

      for(i = 0; i < num_dsts; ++i)
         tmp[i] = src[i];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}


/**
 * Expand the src vector from src.length to dst_length.
 */
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    unsigned dst_length)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef undef;
   LLVMTypeRef type;
   unsigned i, src_length;

   type = LLVMTypeOf(src);

   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
      /* Can't use ShuffleVector on a non-vector type */
      undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
      return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
   }

   undef = LLVMGetUndef(type);
   src_length = LLVMGetVectorSize(type);

   assert(dst_length <= ARRAY_SIZE(elems));
   assert(dst_length >= src_length);

   if (src_length == dst_length)
      return src;

   /* All elements from the src vector */
   for (i = 0; i < src_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Fill the remaining space with undef elements */
   for (i = src_length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
}
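
/*
 * E.g. padding a 4x32 vector to dst_length = 8 yields
 *
 *    v0 v1 v2 v3 u u u u
 *
 * where u is undef; the original values keep their positions.
 */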