1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 /** 29 * @file 30 * Helper functions for logical operations. 31 * 32 * @author Jose Fonseca <jfonseca (at) vmware.com> 33 */ 34 35 36 #include "util/u_cpu_detect.h" 37 #include "util/u_memory.h" 38 #include "util/u_debug.h" 39 40 #include "lp_bld_type.h" 41 #include "lp_bld_const.h" 42 #include "lp_bld_init.h" 43 #include "lp_bld_intr.h" 44 #include "lp_bld_debug.h" 45 #include "lp_bld_logic.h" 46 47 48 /* 49 * XXX 50 * 51 * Selection with vector conditional like 52 * 53 * select <4 x i1> %C, %A, %B 54 * 55 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only 56 * supported on some backends (x86) starting with llvm 3.1. 57 * 58 * Expanding the boolean vector to full SIMD register width, as in 59 * 60 * sext <4 x i1> %C to <4 x i32> 61 * 62 * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but 63 * it causes assertion failures in LLVM 2.6. It appears to work correctly on 64 * LLVM 2.7. 65 */ 66 67 68 /** 69 * Build code to compare two values 'a' and 'b' of 'type' using the given func. 70 * \param func one of PIPE_FUNC_x 71 * The result values will be 0 for false or ~0 for true. 72 */ 73 LLVMValueRef 74 lp_build_compare(struct gallivm_state *gallivm, 75 const struct lp_type type, 76 unsigned func, 77 LLVMValueRef a, 78 LLVMValueRef b) 79 { 80 LLVMBuilderRef builder = gallivm->builder; 81 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type); 82 LLVMValueRef zeros = LLVMConstNull(int_vec_type); 83 LLVMValueRef ones = LLVMConstAllOnes(int_vec_type); 84 LLVMValueRef cond; 85 LLVMValueRef res; 86 87 assert(func >= PIPE_FUNC_NEVER); 88 assert(func <= PIPE_FUNC_ALWAYS); 89 assert(lp_check_value(type, a)); 90 assert(lp_check_value(type, b)); 91 92 if(func == PIPE_FUNC_NEVER) 93 return zeros; 94 if(func == PIPE_FUNC_ALWAYS) 95 return ones; 96 97 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) 98 /* 99 * There are no unsigned integer comparison instructions in SSE. 100 */ 101 102 if (!type.floating && !type.sign && 103 type.width * type.length == 128 && 104 util_cpu_caps.has_sse2 && 105 (func == PIPE_FUNC_LESS || 106 func == PIPE_FUNC_LEQUAL || 107 func == PIPE_FUNC_GREATER || 108 func == PIPE_FUNC_GEQUAL) && 109 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 110 debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n", 111 __FUNCTION__, type.length, type.width); 112 } 113 #endif 114 115 #if HAVE_LLVM < 0x0207 116 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) 117 if(type.width * type.length == 128) { 118 if(type.floating && util_cpu_caps.has_sse) { 119 /* float[4] comparison */ 120 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type); 121 LLVMValueRef args[3]; 122 unsigned cc; 123 boolean swap; 124 125 swap = FALSE; 126 switch(func) { 127 case PIPE_FUNC_EQUAL: 128 cc = 0; 129 break; 130 case PIPE_FUNC_NOTEQUAL: 131 cc = 4; 132 break; 133 case PIPE_FUNC_LESS: 134 cc = 1; 135 break; 136 case PIPE_FUNC_LEQUAL: 137 cc = 2; 138 break; 139 case PIPE_FUNC_GREATER: 140 cc = 1; 141 swap = TRUE; 142 break; 143 case PIPE_FUNC_GEQUAL: 144 cc = 2; 145 swap = TRUE; 146 break; 147 default: 148 assert(0); 149 return lp_build_undef(gallivm, type); 150 } 151 152 if(swap) { 153 args[0] = b; 154 args[1] = a; 155 } 156 else { 157 args[0] = a; 158 args[1] = b; 159 } 160 161 args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0); 162 res = lp_build_intrinsic(builder, 163 "llvm.x86.sse.cmp.ps", 164 vec_type, 165 args, 3); 166 res = LLVMBuildBitCast(builder, res, int_vec_type, ""); 167 return res; 168 } 169 else if(util_cpu_caps.has_sse2) { 170 /* int[4] comparison */ 171 static const struct { 172 unsigned swap:1; 173 unsigned eq:1; 174 unsigned gt:1; 175 unsigned not:1; 176 } table[] = { 177 {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */ 178 {1, 0, 1, 0}, /* PIPE_FUNC_LESS */ 179 {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */ 180 {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */ 181 {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */ 182 {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */ 183 {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */ 184 {0, 0, 0, 0} /* PIPE_FUNC_ALWAYS */ 185 }; 186 const char *pcmpeq; 187 const char *pcmpgt; 188 LLVMValueRef args[2]; 189 LLVMValueRef res; 190 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type); 191 192 switch (type.width) { 193 case 8: 194 pcmpeq = "llvm.x86.sse2.pcmpeq.b"; 195 pcmpgt = "llvm.x86.sse2.pcmpgt.b"; 196 break; 197 case 16: 198 pcmpeq = "llvm.x86.sse2.pcmpeq.w"; 199 pcmpgt = "llvm.x86.sse2.pcmpgt.w"; 200 break; 201 case 32: 202 pcmpeq = "llvm.x86.sse2.pcmpeq.d"; 203 pcmpgt = "llvm.x86.sse2.pcmpgt.d"; 204 break; 205 default: 206 assert(0); 207 return lp_build_undef(gallivm, type); 208 } 209 210 /* There are no unsigned comparison instructions. So flip the sign bit 211 * so that the results match. 212 */ 213 if (table[func].gt && !type.sign) { 214 LLVMValueRef msb = lp_build_const_int_vec(gallivm, type, (unsigned long long)1 << (type.width - 1)); 215 a = LLVMBuildXor(builder, a, msb, ""); 216 b = LLVMBuildXor(builder, b, msb, ""); 217 } 218 219 if(table[func].swap) { 220 args[0] = b; 221 args[1] = a; 222 } 223 else { 224 args[0] = a; 225 args[1] = b; 226 } 227 228 if(table[func].eq) 229 res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2); 230 else if (table[func].gt) 231 res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2); 232 else 233 res = LLVMConstNull(vec_type); 234 235 if(table[func].not) 236 res = LLVMBuildNot(builder, res, ""); 237 238 return res; 239 } 240 } /* if (type.width * type.length == 128) */ 241 #endif 242 #endif /* HAVE_LLVM < 0x0207 */ 243 244 /* XXX: It is not clear if we should use the ordered or unordered operators */ 245 246 if(type.floating) { 247 LLVMRealPredicate op; 248 switch(func) { 249 case PIPE_FUNC_NEVER: 250 op = LLVMRealPredicateFalse; 251 break; 252 case PIPE_FUNC_ALWAYS: 253 op = LLVMRealPredicateTrue; 254 break; 255 case PIPE_FUNC_EQUAL: 256 op = LLVMRealUEQ; 257 break; 258 case PIPE_FUNC_NOTEQUAL: 259 op = LLVMRealUNE; 260 break; 261 case PIPE_FUNC_LESS: 262 op = LLVMRealULT; 263 break; 264 case PIPE_FUNC_LEQUAL: 265 op = LLVMRealULE; 266 break; 267 case PIPE_FUNC_GREATER: 268 op = LLVMRealUGT; 269 break; 270 case PIPE_FUNC_GEQUAL: 271 op = LLVMRealUGE; 272 break; 273 default: 274 assert(0); 275 return lp_build_undef(gallivm, type); 276 } 277 278 #if HAVE_LLVM >= 0x0207 279 cond = LLVMBuildFCmp(builder, op, a, b, ""); 280 res = LLVMBuildSExt(builder, cond, int_vec_type, ""); 281 #else 282 if (type.length == 1) { 283 cond = LLVMBuildFCmp(builder, op, a, b, ""); 284 res = LLVMBuildSExt(builder, cond, int_vec_type, ""); 285 } 286 else { 287 unsigned i; 288 289 res = LLVMGetUndef(int_vec_type); 290 291 debug_printf("%s: warning: using slow element-wise float" 292 " vector comparison\n", __FUNCTION__); 293 for (i = 0; i < type.length; ++i) { 294 LLVMValueRef index = lp_build_const_int32(gallivm, i); 295 cond = LLVMBuildFCmp(builder, op, 296 LLVMBuildExtractElement(builder, a, index, ""), 297 LLVMBuildExtractElement(builder, b, index, ""), 298 ""); 299 cond = LLVMBuildSelect(builder, cond, 300 LLVMConstExtractElement(ones, index), 301 LLVMConstExtractElement(zeros, index), 302 ""); 303 res = LLVMBuildInsertElement(builder, res, cond, index, ""); 304 } 305 } 306 #endif 307 } 308 else { 309 LLVMIntPredicate op; 310 switch(func) { 311 case PIPE_FUNC_EQUAL: 312 op = LLVMIntEQ; 313 break; 314 case PIPE_FUNC_NOTEQUAL: 315 op = LLVMIntNE; 316 break; 317 case PIPE_FUNC_LESS: 318 op = type.sign ? LLVMIntSLT : LLVMIntULT; 319 break; 320 case PIPE_FUNC_LEQUAL: 321 op = type.sign ? LLVMIntSLE : LLVMIntULE; 322 break; 323 case PIPE_FUNC_GREATER: 324 op = type.sign ? LLVMIntSGT : LLVMIntUGT; 325 break; 326 case PIPE_FUNC_GEQUAL: 327 op = type.sign ? LLVMIntSGE : LLVMIntUGE; 328 break; 329 default: 330 assert(0); 331 return lp_build_undef(gallivm, type); 332 } 333 334 #if HAVE_LLVM >= 0x0207 335 cond = LLVMBuildICmp(builder, op, a, b, ""); 336 res = LLVMBuildSExt(builder, cond, int_vec_type, ""); 337 #else 338 if (type.length == 1) { 339 cond = LLVMBuildICmp(builder, op, a, b, ""); 340 res = LLVMBuildSExt(builder, cond, int_vec_type, ""); 341 } 342 else { 343 unsigned i; 344 345 res = LLVMGetUndef(int_vec_type); 346 347 if (gallivm_debug & GALLIVM_DEBUG_PERF) { 348 debug_printf("%s: using slow element-wise int" 349 " vector comparison\n", __FUNCTION__); 350 } 351 352 for(i = 0; i < type.length; ++i) { 353 LLVMValueRef index = lp_build_const_int32(gallivm, i); 354 cond = LLVMBuildICmp(builder, op, 355 LLVMBuildExtractElement(builder, a, index, ""), 356 LLVMBuildExtractElement(builder, b, index, ""), 357 ""); 358 cond = LLVMBuildSelect(builder, cond, 359 LLVMConstExtractElement(ones, index), 360 LLVMConstExtractElement(zeros, index), 361 ""); 362 res = LLVMBuildInsertElement(builder, res, cond, index, ""); 363 } 364 } 365 #endif 366 } 367 368 return res; 369 } 370 371 372 373 /** 374 * Build code to compare two values 'a' and 'b' using the given func. 375 * \param func one of PIPE_FUNC_x 376 * The result values will be 0 for false or ~0 for true. 377 */ 378 LLVMValueRef 379 lp_build_cmp(struct lp_build_context *bld, 380 unsigned func, 381 LLVMValueRef a, 382 LLVMValueRef b) 383 { 384 return lp_build_compare(bld->gallivm, bld->type, func, a, b); 385 } 386 387 388 /** 389 * Return (mask & a) | (~mask & b); 390 */ 391 LLVMValueRef 392 lp_build_select_bitwise(struct lp_build_context *bld, 393 LLVMValueRef mask, 394 LLVMValueRef a, 395 LLVMValueRef b) 396 { 397 LLVMBuilderRef builder = bld->gallivm->builder; 398 struct lp_type type = bld->type; 399 LLVMValueRef res; 400 401 assert(lp_check_value(type, a)); 402 assert(lp_check_value(type, b)); 403 404 if (a == b) { 405 return a; 406 } 407 408 if(type.floating) { 409 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 410 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 411 b = LLVMBuildBitCast(builder, b, int_vec_type, ""); 412 } 413 414 a = LLVMBuildAnd(builder, a, mask, ""); 415 416 /* This often gets translated to PANDN, but sometimes the NOT is 417 * pre-computed and stored in another constant. The best strategy depends 418 * on available registers, so it is not a big deal -- hopefully LLVM does 419 * the right decision attending the rest of the program. 420 */ 421 b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), ""); 422 423 res = LLVMBuildOr(builder, a, b, ""); 424 425 if(type.floating) { 426 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 427 res = LLVMBuildBitCast(builder, res, vec_type, ""); 428 } 429 430 return res; 431 } 432 433 434 /** 435 * Return mask ? a : b; 436 * 437 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value 438 * will yield unpredictable results. 439 */ 440 LLVMValueRef 441 lp_build_select(struct lp_build_context *bld, 442 LLVMValueRef mask, 443 LLVMValueRef a, 444 LLVMValueRef b) 445 { 446 LLVMBuilderRef builder = bld->gallivm->builder; 447 LLVMContextRef lc = bld->gallivm->context; 448 struct lp_type type = bld->type; 449 LLVMValueRef res; 450 451 assert(lp_check_value(type, a)); 452 assert(lp_check_value(type, b)); 453 454 if(a == b) 455 return a; 456 457 if (type.length == 1) { 458 mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), ""); 459 res = LLVMBuildSelect(builder, mask, a, b, ""); 460 } 461 else if (0) { 462 /* Generate a vector select. 463 * 464 * XXX: Using vector selects would avoid emitting intrinsics, but they aren't 465 * properly supported yet. 466 * 467 * LLVM 3.0 includes experimental support provided the -promote-elements 468 * options is passed to LLVM's command line (e.g., via 469 * llvm::cl::ParseCommandLineOptions), but resulting code quality is much 470 * worse, probably because some optimization passes don't know how to 471 * handle vector selects. 472 * 473 * See also: 474 * - http://lists.cs.uiuc.edu/pipermail/llvmdev/2011-October/043659.html 475 */ 476 477 /* Convert the mask to a vector of booleans. 478 * XXX: There are two ways to do this. Decide what's best. 479 */ 480 if (1) { 481 LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length); 482 mask = LLVMBuildTrunc(builder, mask, bool_vec_type, ""); 483 } else { 484 mask = LLVMBuildICmp(builder, LLVMIntNE, mask, LLVMConstNull(bld->int_vec_type), ""); 485 } 486 res = LLVMBuildSelect(builder, mask, a, b, ""); 487 } 488 else if (((util_cpu_caps.has_sse4_1 && 489 type.width * type.length == 128) || 490 (util_cpu_caps.has_avx && 491 type.width * type.length == 256 && type.width >= 32)) && 492 !LLVMIsConstant(a) && 493 !LLVMIsConstant(b) && 494 !LLVMIsConstant(mask)) { 495 const char *intrinsic; 496 LLVMTypeRef arg_type; 497 LLVMValueRef args[3]; 498 499 /* 500 * There's only float blend in AVX but can just cast i32/i64 501 * to float. 502 */ 503 if (type.width * type.length == 256) { 504 if (type.width == 64) { 505 intrinsic = "llvm.x86.avx.blendv.pd.256"; 506 arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4); 507 } 508 else { 509 intrinsic = "llvm.x86.avx.blendv.ps.256"; 510 arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); 511 } 512 } 513 else if (type.floating && 514 type.width == 64) { 515 intrinsic = "llvm.x86.sse41.blendvpd"; 516 arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2); 517 } else if (type.floating && 518 type.width == 32) { 519 intrinsic = "llvm.x86.sse41.blendvps"; 520 arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4); 521 } else { 522 intrinsic = "llvm.x86.sse41.pblendvb"; 523 arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16); 524 } 525 526 if (arg_type != bld->int_vec_type) { 527 mask = LLVMBuildBitCast(builder, mask, arg_type, ""); 528 } 529 530 if (arg_type != bld->vec_type) { 531 a = LLVMBuildBitCast(builder, a, arg_type, ""); 532 b = LLVMBuildBitCast(builder, b, arg_type, ""); 533 } 534 535 args[0] = b; 536 args[1] = a; 537 args[2] = mask; 538 539 res = lp_build_intrinsic(builder, intrinsic, 540 arg_type, args, Elements(args)); 541 542 if (arg_type != bld->vec_type) { 543 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 544 } 545 } 546 else { 547 res = lp_build_select_bitwise(bld, mask, a, b); 548 } 549 550 return res; 551 } 552 553 554 /** 555 * Return mask ? a : b; 556 * 557 * mask is a TGSI_WRITEMASK_xxx. 558 */ 559 LLVMValueRef 560 lp_build_select_aos(struct lp_build_context *bld, 561 unsigned mask, 562 LLVMValueRef a, 563 LLVMValueRef b) 564 { 565 LLVMBuilderRef builder = bld->gallivm->builder; 566 const struct lp_type type = bld->type; 567 const unsigned n = type.length; 568 unsigned i, j; 569 570 assert((mask & ~0xf) == 0); 571 assert(lp_check_value(type, a)); 572 assert(lp_check_value(type, b)); 573 574 if(a == b) 575 return a; 576 if((mask & 0xf) == 0xf) 577 return a; 578 if((mask & 0xf) == 0x0) 579 return b; 580 if(a == bld->undef || b == bld->undef) 581 return bld->undef; 582 583 /* 584 * There are two major ways of accomplishing this: 585 * - with a shuffle 586 * - with a select 587 * 588 * The flip between these is empirical and might need to be adjusted. 589 */ 590 if (n <= 4) { 591 /* 592 * Shuffle. 593 */ 594 LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); 595 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; 596 597 for(j = 0; j < n; j += 4) 598 for(i = 0; i < 4; ++i) 599 shuffles[j + i] = LLVMConstInt(elem_type, 600 (mask & (1 << i) ? 0 : n) + j + i, 601 0); 602 603 return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), ""); 604 } 605 else { 606 LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask); 607 return lp_build_select(bld, mask_vec, a, b); 608 } 609 } 610 611 612 /** 613 * Return (scalar-cast)val ? true : false; 614 */ 615 LLVMValueRef 616 lp_build_any_true_range(struct lp_build_context *bld, 617 unsigned real_length, 618 LLVMValueRef val) 619 { 620 LLVMBuilderRef builder = bld->gallivm->builder; 621 LLVMTypeRef scalar_type; 622 LLVMTypeRef true_type; 623 624 assert(real_length <= bld->type.length); 625 626 true_type = LLVMIntTypeInContext(bld->gallivm->context, 627 bld->type.width * real_length); 628 scalar_type = LLVMIntTypeInContext(bld->gallivm->context, 629 bld->type.width * bld->type.length); 630 val = LLVMBuildBitCast(builder, val, scalar_type, ""); 631 /* 632 * We're using always native types so we can use intrinsics. 633 * However, if we don't do per-element calculations, we must ensure 634 * the excess elements aren't used since they may contain garbage. 635 */ 636 if (real_length < bld->type.length) { 637 val = LLVMBuildTrunc(builder, val, true_type, ""); 638 } 639 return LLVMBuildICmp(builder, LLVMIntNE, 640 val, LLVMConstNull(true_type), ""); 641 } 642