1 /************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29 /** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. 
Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca (at) vmware.com> 45 */ 46 47 48 #include <float.h> 49 50 #include "util/u_memory.h" 51 #include "util/u_debug.h" 52 #include "util/u_math.h" 53 #include "util/u_cpu_detect.h" 54 55 #include "lp_bld_type.h" 56 #include "lp_bld_const.h" 57 #include "lp_bld_init.h" 58 #include "lp_bld_intr.h" 59 #include "lp_bld_logic.h" 60 #include "lp_bld_pack.h" 61 #include "lp_bld_debug.h" 62 #include "lp_bld_bitarit.h" 63 #include "lp_bld_arit.h" 64 #include "lp_bld_flow.h" 65 66 #if defined(PIPE_ARCH_SSE) 67 #include <xmmintrin.h> 68 #endif 69 70 #ifndef _MM_DENORMALS_ZERO_MASK 71 #define _MM_DENORMALS_ZERO_MASK 0x0040 72 #endif 73 74 #ifndef _MM_FLUSH_ZERO_MASK 75 #define _MM_FLUSH_ZERO_MASK 0x8000 76 #endif 77 78 #define EXP_POLY_DEGREE 5 79 80 #define LOG_POLY_DEGREE 4 81 82 83 /** 84 * Generate min(a, b) 85 * No checks for special case values of a or b = 1 or 0 are done. 86 * NaN's are handled according to the behavior specified by the 87 * nan_behavior argument. 
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;   /* target intrinsic name, if any applies */
   unsigned intr_size = 0;         /* native vector width (bits) of the intrinsic */
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   /* First try to map the operation onto a single target intrinsic;
    * fall through to a generic compare+select at the end if none fits.
    */
   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      /* vminfp only handles 4 x f32 vectors. */
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      /* Integer min; LLVM >= 3.9 pattern-matches cmp+select itself, so the
       * explicit intrinsics are only needed for older LLVM versions.
       */
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      /* SSE2 only provides unsigned byte and signed word mins ... */
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      /* ... SSE4.1 fills in the remaining sign/width combinations. */
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      /* AltiVec integer min, all sign/width combinations available. */
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle nan's for floating point numbers. If one of the
       * inputs is nan the other should be returned (required by both D3D10+
       * and OpenCL).
       * The sse intrinsics return the second operator in case of nan by
       * default so we need to special code to handle those.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            /* min(a, NaN) == min(NaN, a) == a: override the intrinsic's
             * pick-second-operand behavior when b is NaN.
             */
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            /* Propagate NaN from the first operand. */
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   /* Generic fallback: compare + select, with NaN handling folded into the
    * condition where required.
    */
   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         /* Unordered cmp yields false; xor with isnan(b) flips the selection
          * to a (which may itself be NaN) whenever b is NaN.
          */
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate fused multiply-add a*b + c, via the llvm.fmuladd intrinsic
 * (which the backend may lower to an FMA instruction or to mul+add).
 */
LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
       * not supported, and instead it falls-back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
306 */ 307 static LLVMValueRef 308 lp_build_max_simple(struct lp_build_context *bld, 309 LLVMValueRef a, 310 LLVMValueRef b, 311 enum gallivm_nan_behavior nan_behavior) 312 { 313 const struct lp_type type = bld->type; 314 const char *intrinsic = NULL; 315 unsigned intr_size = 0; 316 LLVMValueRef cond; 317 318 assert(lp_check_value(type, a)); 319 assert(lp_check_value(type, b)); 320 321 /* TODO: optimize the constant case */ 322 323 if (type.floating && util_cpu_caps.has_sse) { 324 if (type.width == 32) { 325 if (type.length == 1) { 326 intrinsic = "llvm.x86.sse.max.ss"; 327 intr_size = 128; 328 } 329 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 330 intrinsic = "llvm.x86.sse.max.ps"; 331 intr_size = 128; 332 } 333 else { 334 intrinsic = "llvm.x86.avx.max.ps.256"; 335 intr_size = 256; 336 } 337 } 338 if (type.width == 64 && util_cpu_caps.has_sse2) { 339 if (type.length == 1) { 340 intrinsic = "llvm.x86.sse2.max.sd"; 341 intr_size = 128; 342 } 343 else if (type.length == 2 || !util_cpu_caps.has_avx) { 344 intrinsic = "llvm.x86.sse2.max.pd"; 345 intr_size = 128; 346 } 347 else { 348 intrinsic = "llvm.x86.avx.max.pd.256"; 349 intr_size = 256; 350 } 351 } 352 } 353 else if (type.floating && util_cpu_caps.has_altivec) { 354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN || 355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 356 debug_printf("%s: altivec doesn't support nan return nan behavior\n", 357 __FUNCTION__); 358 } 359 if (type.width == 32 || type.length == 4) { 360 intrinsic = "llvm.ppc.altivec.vmaxfp"; 361 intr_size = 128; 362 } 363 } else if (HAVE_LLVM < 0x0309 && 364 util_cpu_caps.has_avx2 && type.length > 4) { 365 intr_size = 256; 366 switch (type.width) { 367 case 8: 368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b"; 369 break; 370 case 16: 371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w"; 372 break; 373 case 32: 374 intrinsic = type.sign ? 
"llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d"; 375 break; 376 } 377 } else if (HAVE_LLVM < 0x0309 && 378 util_cpu_caps.has_sse2 && type.length >= 2) { 379 intr_size = 128; 380 if ((type.width == 8 || type.width == 16) && 381 (type.width * type.length <= 64) && 382 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 384 __FUNCTION__); 385 } 386 if (type.width == 8 && !type.sign) { 387 intrinsic = "llvm.x86.sse2.pmaxu.b"; 388 intr_size = 128; 389 } 390 else if (type.width == 16 && type.sign) { 391 intrinsic = "llvm.x86.sse2.pmaxs.w"; 392 } 393 if (util_cpu_caps.has_sse4_1) { 394 if (type.width == 8 && type.sign) { 395 intrinsic = "llvm.x86.sse41.pmaxsb"; 396 } 397 if (type.width == 16 && !type.sign) { 398 intrinsic = "llvm.x86.sse41.pmaxuw"; 399 } 400 if (type.width == 32 && !type.sign) { 401 intrinsic = "llvm.x86.sse41.pmaxud"; 402 } 403 if (type.width == 32 && type.sign) { 404 intrinsic = "llvm.x86.sse41.pmaxsd"; 405 } 406 } 407 } else if (util_cpu_caps.has_altivec) { 408 intr_size = 128; 409 if (type.width == 8) { 410 if (!type.sign) { 411 intrinsic = "llvm.ppc.altivec.vmaxub"; 412 } else { 413 intrinsic = "llvm.ppc.altivec.vmaxsb"; 414 } 415 } else if (type.width == 16) { 416 if (!type.sign) { 417 intrinsic = "llvm.ppc.altivec.vmaxuh"; 418 } else { 419 intrinsic = "llvm.ppc.altivec.vmaxsh"; 420 } 421 } else if (type.width == 32) { 422 if (!type.sign) { 423 intrinsic = "llvm.ppc.altivec.vmaxuw"; 424 } else { 425 intrinsic = "llvm.ppc.altivec.vmaxsw"; 426 } 427 } 428 } 429 430 if (intrinsic) { 431 if (util_cpu_caps.has_sse && type.floating && 432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && 433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && 434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 435 LLVMValueRef isnan, max; 436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 437 type, 438 intr_size, a, b); 439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 440 
isnan = lp_build_isnan(bld, b); 441 return lp_build_select(bld, isnan, a, max); 442 } else { 443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN); 444 isnan = lp_build_isnan(bld, a); 445 return lp_build_select(bld, isnan, a, max); 446 } 447 } else { 448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 449 type, 450 intr_size, a, b); 451 } 452 } 453 454 if (type.floating) { 455 switch (nan_behavior) { 456 case GALLIVM_NAN_RETURN_NAN: { 457 LLVMValueRef isnan = lp_build_isnan(bld, b); 458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 460 return lp_build_select(bld, cond, a, b); 461 } 462 break; 463 case GALLIVM_NAN_RETURN_OTHER: { 464 LLVMValueRef isnan = lp_build_isnan(bld, a); 465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 467 return lp_build_select(bld, cond, a, b); 468 } 469 break; 470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b); 472 return lp_build_select(bld, cond, a, b); 473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a); 475 return lp_build_select(bld, cond, b, a); 476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 478 return lp_build_select(bld, cond, a, b); 479 break; 480 default: 481 assert(0); 482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 483 return lp_build_select(bld, cond, a, b); 484 } 485 } else { 486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 487 return lp_build_select(bld, cond, a, b); 488 } 489 } 490 491 492 /** 493 * Generate 1 - a, or ~a depending on bld->type. 
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* Trivial constant folding against the builder's canonical 0/1 values. */
   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   /* For unsigned normalized integers 1.0 is an all-ones bit pattern, so
    * 1 - a reduces to a bitwise not.
    */
   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 *
 * For norm types the addition saturates: either via a native saturating
 * intrinsic, or by clamping the operands/result.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Trivial cases. */
   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      /* Unsigned norm saturates at one. */
      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      /* Saturating-add intrinsics for 8/16-bit normalized integers. */
      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* No native saturating add: pre-clamp a so that a + b cannot wrap. */
   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      } else {
         /* Unsigned: clamp a to (1 - b) so the sum can't exceed one. */
         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   /* A scalar already is its own horizontal sum. */
   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * for byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   /* Log2 reduction: split the vector in halves and add them, until only
    * two elements remain.
    */
   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);            /* low half */
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);   /* high half */
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   /* Final step: extract and add the two remaining elements as scalars. */
   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* Pairwise sums: each lane now holds the sum of two input elements. */
   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   /* Interleave even lanes ... */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   /* ... and odd lanes, then one final add yields the four sums. */
   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   /* Pad missing inputs with vectors[0]; the corresponding output lanes
    * are documented as undefined.
    */
   tmp[0] = vectors[0];
   tmp[1] = vectors[1];
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      /* Two rounds of haddps reduce 4 vectors to the four sums. */
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      /* Longer vectors: handle each group of 4 separately, then
       * concatenate the partial results.
       */
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 *
 * For norm types the subtraction saturates, mirroring lp_build_add.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Trivial cases. */
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      /* Unsigned norm saturates at zero when subtracting one. */
      if (!type.sign && b == bld->one)
         return bld->zero;

      /* Saturating-subtract intrinsics for 8/16-bit normalized integers. */
      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* No native saturating sub: pre-clamp a so that a - b cannot wrap. */
   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /* Unsigned: clamp a up to b so the difference can't go below zero. */
         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   /* Clamp to floor of zero for normalized float/fixed types. */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 256
 *
 *     which is the fastest method that satisfies the following OpenGL criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 *     must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   /* n = number of magnitude bits of the narrow (pre-widening) type. */
   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      /* Round toward zero: negate the rounding bias when ab is negative. */
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

1018 /** 1019 * Generate a * b 1020 */ 1021 LLVMValueRef 1022 lp_build_mul(struct lp_build_context *bld, 1023 LLVMValueRef a, 1024 LLVMValueRef b) 1025 { 1026 LLVMBuilderRef builder = bld->gallivm->builder; 1027 const struct lp_type type = bld->type; 1028 LLVMValueRef shift; 1029 LLVMValueRef res; 1030 1031 assert(lp_check_value(type, a)); 1032 assert(lp_check_value(type, b)); 1033 1034 if(a == bld->zero) 1035 return bld->zero; 1036 if(a == bld->one) 1037 return b; 1038 if(b == bld->zero) 1039 return bld->zero; 1040 if(b == bld->one) 1041 return a; 1042 if(a == bld->undef || b == bld->undef) 1043 return bld->undef; 1044 1045 if (!type.floating && !type.fixed && type.norm) { 1046 struct lp_type wide_type = lp_wider_type(type); 1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 1048 1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); 1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); 1051 1052 /* PMULLW, PSRLW, PADDW */ 1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); 1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); 1055 1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); 1057 1058 return ab; 1059 } 1060 1061 if(type.fixed) 1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 1063 else 1064 shift = NULL; 1065 1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1067 if (type.floating) 1068 res = LLVMConstFMul(a, b); 1069 else 1070 res = LLVMConstMul(a, b); 1071 if(shift) { 1072 if(type.sign) 1073 res = LLVMConstAShr(res, shift); 1074 else 1075 res = LLVMConstLShr(res, shift); 1076 } 1077 } 1078 else { 1079 if (type.floating) 1080 res = LLVMBuildFMul(builder, a, b, ""); 1081 else 1082 res = LLVMBuildMul(builder, a, b, ""); 1083 if(shift) { 1084 if(type.sign) 1085 res = LLVMBuildAShr(builder, res, shift, ""); 1086 else 1087 res = LLVMBuildLShr(builder, res, shift, ""); 1088 } 1089 } 1090 1091 return res; 1092 } 1093 1094 /* 1095 * Widening mul, valid for 32x32 bit 
-> 64bit only. 1096 * Result is low 32bits, high bits returned in res_hi. 1097 * 1098 * Emits code that is meant to be compiled for the host CPU. 1099 */ 1100 LLVMValueRef 1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, 1102 LLVMValueRef a, 1103 LLVMValueRef b, 1104 LLVMValueRef *res_hi) 1105 { 1106 struct gallivm_state *gallivm = bld->gallivm; 1107 LLVMBuilderRef builder = gallivm->builder; 1108 1109 assert(bld->type.width == 32); 1110 assert(bld->type.floating == 0); 1111 assert(bld->type.fixed == 0); 1112 assert(bld->type.norm == 0); 1113 1114 /* 1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces 1116 * for x86 simd is atrocious (even if the high bits weren't required), 1117 * trying to handle real 64bit inputs (which of course can't happen due 1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but 1119 * apparently llvm does not recognize this widening mul). This includes 6 1120 * (instead of 2) pmuludq plus extra adds and shifts 1121 * The same story applies to signed mul, albeit fixing this requires sse41. 1122 * https://llvm.org/bugs/show_bug.cgi?id=30845 1123 * So, whip up our own code, albeit only for length 4 and 8 (which 1124 * should be good enough)... 
1125 */ 1126 if ((bld->type.length == 4 || bld->type.length == 8) && 1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || 1128 util_cpu_caps.has_sse4_1)) { 1129 const char *intrinsic = NULL; 1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; 1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; 1132 struct lp_type type_wide = lp_wider_type(bld->type); 1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); 1134 unsigned i; 1135 for (i = 0; i < bld->type.length; i += 2) { 1136 shuf[i] = lp_build_const_int32(gallivm, i+1); 1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 1138 } 1139 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1140 aeven = a; 1141 beven = b; 1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); 1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); 1144 1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) { 1146 if (bld->type.sign) { 1147 intrinsic = "llvm.x86.avx2.pmul.dq"; 1148 } else { 1149 intrinsic = "llvm.x86.avx2.pmulu.dq"; 1150 } 1151 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1152 wider_type, aeven, beven); 1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1154 wider_type, aodd, bodd); 1155 } 1156 else { 1157 /* for consistent naming look elsewhere... */ 1158 if (bld->type.sign) { 1159 intrinsic = "llvm.x86.sse41.pmuldq"; 1160 } else { 1161 intrinsic = "llvm.x86.sse2.pmulu.dq"; 1162 } 1163 /* 1164 * XXX If we only have AVX but not AVX2 this is a pain. 1165 * lp_build_intrinsic_binary_anylength() can't handle it 1166 * (due to src and dst type not being identical). 
1167 */ 1168 if (bld->type.length == 8) { 1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; 1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; 1171 LLVMValueRef muleven2[2], mulodd2[2]; 1172 struct lp_type type_wide_half = type_wide; 1173 LLVMTypeRef wtype_half; 1174 type_wide_half.length = 2; 1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half); 1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); 1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); 1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); 1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); 1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); 1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); 1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); 1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); 1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1185 wtype_half, aevenlo, bevenlo); 1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1187 wtype_half, aoddlo, boddlo); 1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1189 wtype_half, aevenhi, bevenhi); 1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1191 wtype_half, aoddhi, boddhi); 1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); 1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); 1194 1195 } 1196 else { 1197 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1198 wider_type, aeven, beven); 1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1200 wider_type, aodd, bodd); 1201 } 1202 } 1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); 1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); 1205 1206 for (i = 0; i < bld->type.length; i += 2) { 1207 shuf[i] = lp_build_const_int32(gallivm, i + 1); 1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length); 1209 } 1210 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1211 *res_hi = 
LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1212 1213 for (i = 0; i < bld->type.length; i += 2) { 1214 shuf[i] = lp_build_const_int32(gallivm, i); 1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); 1216 } 1217 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1219 } 1220 else { 1221 return lp_build_mul_32_lohi(bld, a, b, res_hi); 1222 } 1223 } 1224 1225 1226 /* 1227 * Widening mul, valid for 32x32 bit -> 64bit only. 1228 * Result is low 32bits, high bits returned in res_hi. 1229 * 1230 * Emits generic code. 1231 */ 1232 LLVMValueRef 1233 lp_build_mul_32_lohi(struct lp_build_context *bld, 1234 LLVMValueRef a, 1235 LLVMValueRef b, 1236 LLVMValueRef *res_hi) 1237 { 1238 struct gallivm_state *gallivm = bld->gallivm; 1239 LLVMBuilderRef builder = gallivm->builder; 1240 LLVMValueRef tmp, shift, res_lo; 1241 struct lp_type type_tmp; 1242 LLVMTypeRef wide_type, narrow_type; 1243 1244 type_tmp = bld->type; 1245 narrow_type = lp_build_vec_type(gallivm, type_tmp); 1246 type_tmp.width *= 2; 1247 wide_type = lp_build_vec_type(gallivm, type_tmp); 1248 shift = lp_build_const_vec(gallivm, type_tmp, 32); 1249 1250 if (bld->type.sign) { 1251 a = LLVMBuildSExt(builder, a, wide_type, ""); 1252 b = LLVMBuildSExt(builder, b, wide_type, ""); 1253 } else { 1254 a = LLVMBuildZExt(builder, a, wide_type, ""); 1255 b = LLVMBuildZExt(builder, b, wide_type, ""); 1256 } 1257 tmp = LLVMBuildMul(builder, a, b, ""); 1258 1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1260 1261 /* Since we truncate anyway, LShr and AShr are equivalent. 
*/ 1262 tmp = LLVMBuildLShr(builder, tmp, shift, ""); 1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1264 1265 return res_lo; 1266 } 1267 1268 1269 /* a * b + c */ 1270 LLVMValueRef 1271 lp_build_mad(struct lp_build_context *bld, 1272 LLVMValueRef a, 1273 LLVMValueRef b, 1274 LLVMValueRef c) 1275 { 1276 const struct lp_type type = bld->type; 1277 if (type.floating) { 1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c); 1279 } else { 1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c); 1281 } 1282 } 1283 1284 1285 /** 1286 * Small vector x scale multiplication optimization. 1287 */ 1288 LLVMValueRef 1289 lp_build_mul_imm(struct lp_build_context *bld, 1290 LLVMValueRef a, 1291 int b) 1292 { 1293 LLVMBuilderRef builder = bld->gallivm->builder; 1294 LLVMValueRef factor; 1295 1296 assert(lp_check_value(bld->type, a)); 1297 1298 if(b == 0) 1299 return bld->zero; 1300 1301 if(b == 1) 1302 return a; 1303 1304 if(b == -1) 1305 return lp_build_negate(bld, a); 1306 1307 if(b == 2 && bld->type.floating) 1308 return lp_build_add(bld, a, a); 1309 1310 if(util_is_power_of_two(b)) { 1311 unsigned shift = ffs(b) - 1; 1312 1313 if(bld->type.floating) { 1314 #if 0 1315 /* 1316 * Power of two multiplication by directly manipulating the exponent. 1317 * 1318 * XXX: This might not be always faster, it will introduce a small error 1319 * for multiplication by zero, and it will produce wrong results 1320 * for Inf and NaN. 
1321 */ 1322 unsigned mantissa = lp_mantissa(bld->type); 1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 1325 a = LLVMBuildAdd(builder, a, factor, ""); 1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 1327 return a; 1328 #endif 1329 } 1330 else { 1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 1332 return LLVMBuildShl(builder, a, factor, ""); 1333 } 1334 } 1335 1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 1337 return lp_build_mul(bld, a, factor); 1338 } 1339 1340 1341 /** 1342 * Generate a / b 1343 */ 1344 LLVMValueRef 1345 lp_build_div(struct lp_build_context *bld, 1346 LLVMValueRef a, 1347 LLVMValueRef b) 1348 { 1349 LLVMBuilderRef builder = bld->gallivm->builder; 1350 const struct lp_type type = bld->type; 1351 1352 assert(lp_check_value(type, a)); 1353 assert(lp_check_value(type, b)); 1354 1355 if(a == bld->zero) 1356 return bld->zero; 1357 if(a == bld->one && type.floating) 1358 return lp_build_rcp(bld, b); 1359 if(b == bld->zero) 1360 return bld->undef; 1361 if(b == bld->one) 1362 return a; 1363 if(a == bld->undef || b == bld->undef) 1364 return bld->undef; 1365 1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1367 if (type.floating) 1368 return LLVMConstFDiv(a, b); 1369 else if (type.sign) 1370 return LLVMConstSDiv(a, b); 1371 else 1372 return LLVMConstUDiv(a, b); 1373 } 1374 1375 /* fast rcp is disabled (just uses div), so makes no sense to try that */ 1376 if(FALSE && 1377 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1378 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && 1379 type.floating) 1380 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 1381 1382 if (type.floating) 1383 return LLVMBuildFDiv(builder, a, b, ""); 1384 else if (type.sign) 1385 return LLVMBuildSDiv(builder, a, b, ""); 1386 else 1387 
return LLVMBuildUDiv(builder, a, b, ""); 1388 } 1389 1390 1391 /** 1392 * Linear interpolation helper. 1393 * 1394 * @param normalized whether we are interpolating normalized values, 1395 * encoded in normalized integers, twice as wide. 1396 * 1397 * @sa http://www.stereopsis.com/doubleblend.html 1398 */ 1399 static inline LLVMValueRef 1400 lp_build_lerp_simple(struct lp_build_context *bld, 1401 LLVMValueRef x, 1402 LLVMValueRef v0, 1403 LLVMValueRef v1, 1404 unsigned flags) 1405 { 1406 unsigned half_width = bld->type.width/2; 1407 LLVMBuilderRef builder = bld->gallivm->builder; 1408 LLVMValueRef delta; 1409 LLVMValueRef res; 1410 1411 assert(lp_check_value(bld->type, x)); 1412 assert(lp_check_value(bld->type, v0)); 1413 assert(lp_check_value(bld->type, v1)); 1414 1415 delta = lp_build_sub(bld, v1, v0); 1416 1417 if (bld->type.floating) { 1418 assert(flags == 0); 1419 return lp_build_mad(bld, x, delta, v0); 1420 } 1421 1422 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) { 1423 if (!bld->type.sign) { 1424 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) { 1425 /* 1426 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the 1427 * most-significant-bit to the lowest-significant-bit, so that 1428 * later we can just divide by 2**n instead of 2**n - 1. 1429 */ 1430 1431 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); 1432 } 1433 1434 /* (x * delta) >> n */ 1435 res = lp_build_mul(bld, x, delta); 1436 res = lp_build_shr_imm(bld, res, half_width); 1437 } else { 1438 /* 1439 * The rescaling trick above doesn't work for signed numbers, so 1440 * use the 2**n - 1 divison approximation in lp_build_mul_norm 1441 * instead. 
1442 */ 1443 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1444 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); 1445 } 1446 } else { 1447 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1448 res = lp_build_mul(bld, x, delta); 1449 } 1450 1451 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) { 1452 /* 1453 * At this point both res and v0 only use the lower half of the bits, 1454 * the rest is zero. Instead of add / mask, do add with half wide type. 1455 */ 1456 struct lp_type narrow_type; 1457 struct lp_build_context narrow_bld; 1458 1459 memset(&narrow_type, 0, sizeof narrow_type); 1460 narrow_type.sign = bld->type.sign; 1461 narrow_type.width = bld->type.width/2; 1462 narrow_type.length = bld->type.length*2; 1463 1464 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type); 1465 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, ""); 1466 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, ""); 1467 res = lp_build_add(&narrow_bld, v0, res); 1468 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 1469 } else { 1470 res = lp_build_add(bld, v0, res); 1471 1472 if (bld->type.fixed) { 1473 /* 1474 * We need to mask out the high order bits when lerping 8bit 1475 * normalized colors stored on 16bits 1476 */ 1477 /* XXX: This step is necessary for lerping 8bit colors stored on 1478 * 16bits, but it will be wrong for true fixed point use cases. 1479 * Basically we need a more powerful lp_type, capable of further 1480 * distinguishing the values interpretation from the value storage. 1481 */ 1482 LLVMValueRef low_bits; 1483 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1); 1484 res = LLVMBuildAnd(builder, res, low_bits, ""); 1485 } 1486 } 1487 1488 return res; 1489 } 1490 1491 1492 /** 1493 * Linear interpolation. 
1494 */ 1495 LLVMValueRef 1496 lp_build_lerp(struct lp_build_context *bld, 1497 LLVMValueRef x, 1498 LLVMValueRef v0, 1499 LLVMValueRef v1, 1500 unsigned flags) 1501 { 1502 const struct lp_type type = bld->type; 1503 LLVMValueRef res; 1504 1505 assert(lp_check_value(type, x)); 1506 assert(lp_check_value(type, v0)); 1507 assert(lp_check_value(type, v1)); 1508 1509 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED)); 1510 1511 if (type.norm) { 1512 struct lp_type wide_type; 1513 struct lp_build_context wide_bld; 1514 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 1515 1516 assert(type.length >= 2); 1517 1518 /* 1519 * Create a wider integer type, enough to hold the 1520 * intermediate result of the multiplication. 1521 */ 1522 memset(&wide_type, 0, sizeof wide_type); 1523 wide_type.sign = type.sign; 1524 wide_type.width = type.width*2; 1525 wide_type.length = type.length/2; 1526 1527 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 1528 1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); 1530 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 1531 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 1532 1533 /* 1534 * Lerp both halves. 1535 */ 1536 1537 flags |= LP_BLD_LERP_WIDE_NORMALIZED; 1538 1539 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); 1540 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); 1541 1542 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); 1543 } else { 1544 res = lp_build_lerp_simple(bld, x, v0, v1, flags); 1545 } 1546 1547 return res; 1548 } 1549 1550 1551 /** 1552 * Bilinear interpolation. 1553 * 1554 * Values indices are in v_{yx}. 
1555 */ 1556 LLVMValueRef 1557 lp_build_lerp_2d(struct lp_build_context *bld, 1558 LLVMValueRef x, 1559 LLVMValueRef y, 1560 LLVMValueRef v00, 1561 LLVMValueRef v01, 1562 LLVMValueRef v10, 1563 LLVMValueRef v11, 1564 unsigned flags) 1565 { 1566 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags); 1567 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags); 1568 return lp_build_lerp(bld, y, v0, v1, flags); 1569 } 1570 1571 1572 LLVMValueRef 1573 lp_build_lerp_3d(struct lp_build_context *bld, 1574 LLVMValueRef x, 1575 LLVMValueRef y, 1576 LLVMValueRef z, 1577 LLVMValueRef v000, 1578 LLVMValueRef v001, 1579 LLVMValueRef v010, 1580 LLVMValueRef v011, 1581 LLVMValueRef v100, 1582 LLVMValueRef v101, 1583 LLVMValueRef v110, 1584 LLVMValueRef v111, 1585 unsigned flags) 1586 { 1587 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags); 1588 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags); 1589 return lp_build_lerp(bld, z, v0, v1, flags); 1590 } 1591 1592 1593 /** 1594 * Generate min(a, b) 1595 * Do checks for special cases but not for nans. 1596 */ 1597 LLVMValueRef 1598 lp_build_min(struct lp_build_context *bld, 1599 LLVMValueRef a, 1600 LLVMValueRef b) 1601 { 1602 assert(lp_check_value(bld->type, a)); 1603 assert(lp_check_value(bld->type, b)); 1604 1605 if(a == bld->undef || b == bld->undef) 1606 return bld->undef; 1607 1608 if(a == b) 1609 return a; 1610 1611 if (bld->type.norm) { 1612 if (!bld->type.sign) { 1613 if (a == bld->zero || b == bld->zero) { 1614 return bld->zero; 1615 } 1616 } 1617 if(a == bld->one) 1618 return b; 1619 if(b == bld->one) 1620 return a; 1621 } 1622 1623 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1624 } 1625 1626 1627 /** 1628 * Generate min(a, b) 1629 * NaN's are handled according to the behavior specified by the 1630 * nan_behavior argument. 
1631 */ 1632 LLVMValueRef 1633 lp_build_min_ext(struct lp_build_context *bld, 1634 LLVMValueRef a, 1635 LLVMValueRef b, 1636 enum gallivm_nan_behavior nan_behavior) 1637 { 1638 assert(lp_check_value(bld->type, a)); 1639 assert(lp_check_value(bld->type, b)); 1640 1641 if(a == bld->undef || b == bld->undef) 1642 return bld->undef; 1643 1644 if(a == b) 1645 return a; 1646 1647 if (bld->type.norm) { 1648 if (!bld->type.sign) { 1649 if (a == bld->zero || b == bld->zero) { 1650 return bld->zero; 1651 } 1652 } 1653 if(a == bld->one) 1654 return b; 1655 if(b == bld->one) 1656 return a; 1657 } 1658 1659 return lp_build_min_simple(bld, a, b, nan_behavior); 1660 } 1661 1662 /** 1663 * Generate max(a, b) 1664 * Do checks for special cases, but NaN behavior is undefined. 1665 */ 1666 LLVMValueRef 1667 lp_build_max(struct lp_build_context *bld, 1668 LLVMValueRef a, 1669 LLVMValueRef b) 1670 { 1671 assert(lp_check_value(bld->type, a)); 1672 assert(lp_check_value(bld->type, b)); 1673 1674 if(a == bld->undef || b == bld->undef) 1675 return bld->undef; 1676 1677 if(a == b) 1678 return a; 1679 1680 if(bld->type.norm) { 1681 if(a == bld->one || b == bld->one) 1682 return bld->one; 1683 if (!bld->type.sign) { 1684 if (a == bld->zero) { 1685 return b; 1686 } 1687 if (b == bld->zero) { 1688 return a; 1689 } 1690 } 1691 } 1692 1693 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1694 } 1695 1696 1697 /** 1698 * Generate max(a, b) 1699 * Checks for special cases. 1700 * NaN's are handled according to the behavior specified by the 1701 * nan_behavior argument. 
1702 */ 1703 LLVMValueRef 1704 lp_build_max_ext(struct lp_build_context *bld, 1705 LLVMValueRef a, 1706 LLVMValueRef b, 1707 enum gallivm_nan_behavior nan_behavior) 1708 { 1709 assert(lp_check_value(bld->type, a)); 1710 assert(lp_check_value(bld->type, b)); 1711 1712 if(a == bld->undef || b == bld->undef) 1713 return bld->undef; 1714 1715 if(a == b) 1716 return a; 1717 1718 if(bld->type.norm) { 1719 if(a == bld->one || b == bld->one) 1720 return bld->one; 1721 if (!bld->type.sign) { 1722 if (a == bld->zero) { 1723 return b; 1724 } 1725 if (b == bld->zero) { 1726 return a; 1727 } 1728 } 1729 } 1730 1731 return lp_build_max_simple(bld, a, b, nan_behavior); 1732 } 1733 1734 /** 1735 * Generate clamp(a, min, max) 1736 * NaN behavior (for any of a, min, max) is undefined. 1737 * Do checks for special cases. 1738 */ 1739 LLVMValueRef 1740 lp_build_clamp(struct lp_build_context *bld, 1741 LLVMValueRef a, 1742 LLVMValueRef min, 1743 LLVMValueRef max) 1744 { 1745 assert(lp_check_value(bld->type, a)); 1746 assert(lp_check_value(bld->type, min)); 1747 assert(lp_check_value(bld->type, max)); 1748 1749 a = lp_build_min(bld, a, max); 1750 a = lp_build_max(bld, a, min); 1751 return a; 1752 } 1753 1754 1755 /** 1756 * Generate clamp(a, 0, 1) 1757 * A NaN will get converted to zero. 
1758 */ 1759 LLVMValueRef 1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld, 1761 LLVMValueRef a) 1762 { 1763 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 1764 a = lp_build_min(bld, a, bld->one); 1765 return a; 1766 } 1767 1768 1769 /** 1770 * Generate abs(a) 1771 */ 1772 LLVMValueRef 1773 lp_build_abs(struct lp_build_context *bld, 1774 LLVMValueRef a) 1775 { 1776 LLVMBuilderRef builder = bld->gallivm->builder; 1777 const struct lp_type type = bld->type; 1778 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1779 1780 assert(lp_check_value(type, a)); 1781 1782 if(!type.sign) 1783 return a; 1784 1785 if(type.floating) { 1786 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) { 1787 /* Workaround llvm.org/PR27332 */ 1788 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1789 unsigned long long absMask = ~(1ULL << (type.width - 1)); 1790 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 1791 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1792 a = LLVMBuildAnd(builder, a, mask, ""); 1793 a = LLVMBuildBitCast(builder, a, vec_type, ""); 1794 return a; 1795 } else { 1796 char intrinsic[32]; 1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); 1798 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1799 } 1800 } 1801 1802 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) { 1803 switch(type.width) { 1804 case 8: 1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1806 case 16: 1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1808 case 32: 1809 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1810 } 1811 } 1812 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) { 1813 switch(type.width) { 1814 case 8: 1815 return 
lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); 1816 case 16: 1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); 1818 case 32: 1819 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); 1820 } 1821 } 1822 1823 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero), 1824 a, LLVMBuildNeg(builder, a, "")); 1825 } 1826 1827 1828 LLVMValueRef 1829 lp_build_negate(struct lp_build_context *bld, 1830 LLVMValueRef a) 1831 { 1832 LLVMBuilderRef builder = bld->gallivm->builder; 1833 1834 assert(lp_check_value(bld->type, a)); 1835 1836 if (bld->type.floating) 1837 a = LLVMBuildFNeg(builder, a, ""); 1838 else 1839 a = LLVMBuildNeg(builder, a, ""); 1840 1841 return a; 1842 } 1843 1844 1845 /** Return -1, 0 or +1 depending on the sign of a */ 1846 LLVMValueRef 1847 lp_build_sgn(struct lp_build_context *bld, 1848 LLVMValueRef a) 1849 { 1850 LLVMBuilderRef builder = bld->gallivm->builder; 1851 const struct lp_type type = bld->type; 1852 LLVMValueRef cond; 1853 LLVMValueRef res; 1854 1855 assert(lp_check_value(type, a)); 1856 1857 /* Handle non-zero case */ 1858 if(!type.sign) { 1859 /* if not zero then sign must be positive */ 1860 res = bld->one; 1861 } 1862 else if(type.floating) { 1863 LLVMTypeRef vec_type; 1864 LLVMTypeRef int_type; 1865 LLVMValueRef mask; 1866 LLVMValueRef sign; 1867 LLVMValueRef one; 1868 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1869 1870 int_type = lp_build_int_vec_type(bld->gallivm, type); 1871 vec_type = lp_build_vec_type(bld->gallivm, type); 1872 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1873 1874 /* Take the sign bit and add it to 1 constant */ 1875 sign = LLVMBuildBitCast(builder, a, int_type, ""); 1876 sign = LLVMBuildAnd(builder, sign, mask, ""); 1877 one = LLVMConstBitCast(bld->one, int_type); 1878 res = LLVMBuildOr(builder, sign, one, ""); 1879 res = LLVMBuildBitCast(builder, res, vec_type, ""); 
1880 } 1881 else 1882 { 1883 /* signed int/norm/fixed point */ 1884 /* could use psign with sse3 and appropriate vectors here */ 1885 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1886 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1887 res = lp_build_select(bld, cond, bld->one, minus_one); 1888 } 1889 1890 /* Handle zero */ 1891 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1892 res = lp_build_select(bld, cond, bld->zero, res); 1893 1894 return res; 1895 } 1896 1897 1898 /** 1899 * Set the sign of float vector 'a' according to 'sign'. 1900 * If sign==0, return abs(a). 1901 * If sign==1, return -abs(a); 1902 * Other values for sign produce undefined results. 1903 */ 1904 LLVMValueRef 1905 lp_build_set_sign(struct lp_build_context *bld, 1906 LLVMValueRef a, LLVMValueRef sign) 1907 { 1908 LLVMBuilderRef builder = bld->gallivm->builder; 1909 const struct lp_type type = bld->type; 1910 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1911 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1912 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1913 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1914 ~((unsigned long long) 1 << (type.width - 1))); 1915 LLVMValueRef val, res; 1916 1917 assert(type.floating); 1918 assert(lp_check_value(type, a)); 1919 1920 /* val = reinterpret_cast<int>(a) */ 1921 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1922 /* val = val & mask */ 1923 val = LLVMBuildAnd(builder, val, mask, ""); 1924 /* sign = sign << shift */ 1925 sign = LLVMBuildShl(builder, sign, shift, ""); 1926 /* res = val | sign */ 1927 res = LLVMBuildOr(builder, val, sign, ""); 1928 /* res = reinterpret_cast<float>(res) */ 1929 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1930 1931 return res; 1932 } 1933 1934 1935 /** 1936 * Convert vector of (or scalar) int to vector of (or scalar) float. 
1937 */ 1938 LLVMValueRef 1939 lp_build_int_to_float(struct lp_build_context *bld, 1940 LLVMValueRef a) 1941 { 1942 LLVMBuilderRef builder = bld->gallivm->builder; 1943 const struct lp_type type = bld->type; 1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1945 1946 assert(type.floating); 1947 1948 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1949 } 1950 1951 static boolean 1952 arch_rounding_available(const struct lp_type type) 1953 { 1954 if ((util_cpu_caps.has_sse4_1 && 1955 (type.length == 1 || type.width*type.length == 128)) || 1956 (util_cpu_caps.has_avx && type.width*type.length == 256) || 1957 (util_cpu_caps.has_avx512f && type.width*type.length == 512)) 1958 return TRUE; 1959 else if ((util_cpu_caps.has_altivec && 1960 (type.width == 32 && type.length == 4))) 1961 return TRUE; 1962 1963 return FALSE; 1964 } 1965 1966 enum lp_build_round_mode 1967 { 1968 LP_BUILD_ROUND_NEAREST = 0, 1969 LP_BUILD_ROUND_FLOOR = 1, 1970 LP_BUILD_ROUND_CEIL = 2, 1971 LP_BUILD_ROUND_TRUNCATE = 3 1972 }; 1973 1974 static inline LLVMValueRef 1975 lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1976 LLVMValueRef a) 1977 { 1978 LLVMBuilderRef builder = bld->gallivm->builder; 1979 const struct lp_type type = bld->type; 1980 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1981 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1982 const char *intrinsic; 1983 LLVMValueRef res; 1984 1985 assert(type.floating); 1986 /* using the double precision conversions is a bit more complicated */ 1987 assert(type.width == 32); 1988 1989 assert(lp_check_value(type, a)); 1990 assert(util_cpu_caps.has_sse2); 1991 1992 /* This is relying on MXCSR rounding mode, which should always be nearest. 
*/ 1993 if (type.length == 1) { 1994 LLVMTypeRef vec_type; 1995 LLVMValueRef undef; 1996 LLVMValueRef arg; 1997 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1998 1999 vec_type = LLVMVectorType(bld->elem_type, 4); 2000 2001 intrinsic = "llvm.x86.sse.cvtss2si"; 2002 2003 undef = LLVMGetUndef(vec_type); 2004 2005 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 2006 2007 res = lp_build_intrinsic_unary(builder, intrinsic, 2008 ret_type, arg); 2009 } 2010 else { 2011 if (type.width* type.length == 128) { 2012 intrinsic = "llvm.x86.sse2.cvtps2dq"; 2013 } 2014 else { 2015 assert(type.width*type.length == 256); 2016 assert(util_cpu_caps.has_avx); 2017 2018 intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 2019 } 2020 res = lp_build_intrinsic_unary(builder, intrinsic, 2021 ret_type, a); 2022 } 2023 2024 return res; 2025 } 2026 2027 2028 /* 2029 */ 2030 static inline LLVMValueRef 2031 lp_build_round_altivec(struct lp_build_context *bld, 2032 LLVMValueRef a, 2033 enum lp_build_round_mode mode) 2034 { 2035 LLVMBuilderRef builder = bld->gallivm->builder; 2036 const struct lp_type type = bld->type; 2037 const char *intrinsic = NULL; 2038 2039 assert(type.floating); 2040 2041 assert(lp_check_value(type, a)); 2042 assert(util_cpu_caps.has_altivec); 2043 2044 (void)type; 2045 2046 switch (mode) { 2047 case LP_BUILD_ROUND_NEAREST: 2048 intrinsic = "llvm.ppc.altivec.vrfin"; 2049 break; 2050 case LP_BUILD_ROUND_FLOOR: 2051 intrinsic = "llvm.ppc.altivec.vrfim"; 2052 break; 2053 case LP_BUILD_ROUND_CEIL: 2054 intrinsic = "llvm.ppc.altivec.vrfip"; 2055 break; 2056 case LP_BUILD_ROUND_TRUNCATE: 2057 intrinsic = "llvm.ppc.altivec.vrfiz"; 2058 break; 2059 } 2060 2061 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2062 } 2063 2064 static inline LLVMValueRef 2065 lp_build_round_arch(struct lp_build_context *bld, 2066 LLVMValueRef a, 2067 enum lp_build_round_mode mode) 2068 { 2069 if (util_cpu_caps.has_sse4_1) { 2070 LLVMBuilderRef builder = 
bld->gallivm->builder; 2071 const struct lp_type type = bld->type; 2072 const char *intrinsic_root; 2073 char intrinsic[32]; 2074 2075 assert(type.floating); 2076 assert(lp_check_value(type, a)); 2077 (void)type; 2078 2079 switch (mode) { 2080 case LP_BUILD_ROUND_NEAREST: 2081 intrinsic_root = "llvm.nearbyint"; 2082 break; 2083 case LP_BUILD_ROUND_FLOOR: 2084 intrinsic_root = "llvm.floor"; 2085 break; 2086 case LP_BUILD_ROUND_CEIL: 2087 intrinsic_root = "llvm.ceil"; 2088 break; 2089 case LP_BUILD_ROUND_TRUNCATE: 2090 intrinsic_root = "llvm.trunc"; 2091 break; 2092 } 2093 2094 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); 2095 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2096 } 2097 else /* (util_cpu_caps.has_altivec) */ 2098 return lp_build_round_altivec(bld, a, mode); 2099 } 2100 2101 /** 2102 * Return the integer part of a float (vector) value (== round toward zero). 2103 * The returned value is a float (vector). 2104 * Ex: trunc(-1.5) = -1.0 2105 */ 2106 LLVMValueRef 2107 lp_build_trunc(struct lp_build_context *bld, 2108 LLVMValueRef a) 2109 { 2110 LLVMBuilderRef builder = bld->gallivm->builder; 2111 const struct lp_type type = bld->type; 2112 2113 assert(type.floating); 2114 assert(lp_check_value(type, a)); 2115 2116 if (arch_rounding_available(type)) { 2117 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); 2118 } 2119 else { 2120 const struct lp_type type = bld->type; 2121 struct lp_type inttype; 2122 struct lp_build_context intbld; 2123 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2124 LLVMValueRef trunc, res, anosign, mask; 2125 LLVMTypeRef int_vec_type = bld->int_vec_type; 2126 LLVMTypeRef vec_type = bld->vec_type; 2127 2128 assert(type.width == 32); /* might want to handle doubles at some point */ 2129 2130 inttype = type; 2131 inttype.floating = 0; 2132 lp_build_context_init(&intbld, bld->gallivm, inttype); 2133 2134 /* round by truncation */ 2135 trunc = 
         LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      /* mask is all-ones for big/special values: those pass through unmodified */
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round via float -> int -> float round trip */
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         /* non-32-bit floats: just rely on the generic llvm.floor intrinsic */
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);
      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         /* non-32-bit floats: just rely on the generic llvm.ceil intrinsic */
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 * Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* fptosi truncates toward zero by definition */
   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      /* no round-to-nearest hardware: add +/-0.5 then truncate */
      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1.0
 * Ex: ifloor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   /* for unsigned types truncation == floor, so only signed needs fixup */
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


/**
 * Generate sqrt(a) via the generic llvm.sqrt intrinsic.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocate precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   /* res = rcp_a * (2 - a * rcp_a) */
   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}


/**
 * Generate the reciprocal 1/a (elementwise), currently as a plain fdiv.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   /* trivial algebraic simplifications for known values */
   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocate of 1.0 exactly
    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, a case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   /* NOTE: intentionally dead code (FALSE disables the RCPPS path), kept
    * for reference in case it is resurrected per the comment above. */
   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   /* res = 0.5 * rsqrt_a * (3 - a * rsqrt_a * rsqrt_a) */
   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    * (Path currently disabled via the 0 && below.)
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         /* patch up the special cases the refinement steps got wrong */
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid to call rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
 * unavailable it would result in sqrt/div/mul so obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return true;
   }
   return false;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /* cephes-style range reduction + two minimax polynomials; the inline
    * SSE comments show the scalar/intrinsic code this was ported from */

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynoms
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 *
Generate cos(a) 3108 */ 3109 LLVMValueRef 3110 lp_build_cos(struct lp_build_context *bld, 3111 LLVMValueRef a) 3112 { 3113 return lp_build_sin_or_cos(bld, a, TRUE); 3114 } 3115 3116 3117 /** 3118 * Generate pow(x, y) 3119 */ 3120 LLVMValueRef 3121 lp_build_pow(struct lp_build_context *bld, 3122 LLVMValueRef x, 3123 LLVMValueRef y) 3124 { 3125 /* TODO: optimize the constant case */ 3126 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3127 LLVMIsConstant(x) && LLVMIsConstant(y)) { 3128 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3129 __FUNCTION__); 3130 } 3131 3132 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 3133 } 3134 3135 3136 /** 3137 * Generate exp(x) 3138 */ 3139 LLVMValueRef 3140 lp_build_exp(struct lp_build_context *bld, 3141 LLVMValueRef x) 3142 { 3143 /* log2(e) = 1/log(2) */ 3144 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 3145 1.4426950408889634); 3146 3147 assert(lp_check_value(bld->type, x)); 3148 3149 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 3150 } 3151 3152 3153 /** 3154 * Generate log(x) 3155 * Behavior is undefined with infs, 0s and nans 3156 */ 3157 LLVMValueRef 3158 lp_build_log(struct lp_build_context *bld, 3159 LLVMValueRef x) 3160 { 3161 /* log(2) */ 3162 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3163 0.69314718055994529); 3164 3165 assert(lp_check_value(bld->type, x)); 3166 3167 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 3168 } 3169 3170 /** 3171 * Generate log(x) that handles edge cases (infs, 0s and nans) 3172 */ 3173 LLVMValueRef 3174 lp_build_log_safe(struct lp_build_context *bld, 3175 LLVMValueRef x) 3176 { 3177 /* log(2) */ 3178 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3179 0.69314718055994529); 3180 3181 assert(lp_check_value(bld->type, x)); 3182 3183 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x)); 3184 } 3185 3186 3187 /** 3188 * Generate polynomial. 
3189 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 3190 */ 3191 LLVMValueRef 3192 lp_build_polynomial(struct lp_build_context *bld, 3193 LLVMValueRef x, 3194 const double *coeffs, 3195 unsigned num_coeffs) 3196 { 3197 const struct lp_type type = bld->type; 3198 LLVMValueRef even = NULL, odd = NULL; 3199 LLVMValueRef x2; 3200 unsigned i; 3201 3202 assert(lp_check_value(bld->type, x)); 3203 3204 /* TODO: optimize the constant case */ 3205 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3206 LLVMIsConstant(x)) { 3207 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3208 __FUNCTION__); 3209 } 3210 3211 /* 3212 * Calculate odd and even terms seperately to decrease data dependency 3213 * Ex: 3214 * c[0] + x^2 * c[2] + x^4 * c[4] ... 3215 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 3216 */ 3217 x2 = lp_build_mul(bld, x, x); 3218 3219 for (i = num_coeffs; i--; ) { 3220 LLVMValueRef coeff; 3221 3222 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 3223 3224 if (i % 2 == 0) { 3225 if (even) 3226 even = lp_build_mad(bld, x2, even, coeff); 3227 else 3228 even = coeff; 3229 } else { 3230 if (odd) 3231 odd = lp_build_mad(bld, x2, odd, coeff); 3232 else 3233 odd = coeff; 3234 } 3235 } 3236 3237 if (odd) 3238 return lp_build_mad(bld, odd, x, even); 3239 else if (even) 3240 return even; 3241 else 3242 return bld->undef; 3243 } 3244 3245 3246 /** 3247 * Minimax polynomial fit of 2**x, in range [0, 1[ 3248 */ 3249 const double lp_build_exp2_polynomial[] = { 3250 #if EXP_POLY_DEGREE == 5 3251 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */ 3252 0.693153073200168932794, 3253 0.240153617044375388211, 3254 0.0558263180532956664775, 3255 0.00898934009049466391101, 3256 0.00187757667519147912699 3257 #elif EXP_POLY_DEGREE == 4 3258 1.00000259337069434683, 3259 0.693003834469974940458, 3260 0.24144275689150793076, 3261 0.0520114606103070150235, 3262 0.0135341679161270268764 3263 #elif EXP_POLY_DEGREE == 3 3264 
0.999925218562710312959, 3265 0.695833540494823811697, 3266 0.226067155427249155588, 3267 0.0780245226406372992967 3268 #elif EXP_POLY_DEGREE == 2 3269 1.00172476321474503578, 3270 0.657636275736077639316, 3271 0.33718943461968720704 3272 #else 3273 #error 3274 #endif 3275 }; 3276 3277 3278 LLVMValueRef 3279 lp_build_exp2(struct lp_build_context *bld, 3280 LLVMValueRef x) 3281 { 3282 LLVMBuilderRef builder = bld->gallivm->builder; 3283 const struct lp_type type = bld->type; 3284 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3285 LLVMValueRef ipart = NULL; 3286 LLVMValueRef fpart = NULL; 3287 LLVMValueRef expipart = NULL; 3288 LLVMValueRef expfpart = NULL; 3289 LLVMValueRef res = NULL; 3290 3291 assert(lp_check_value(bld->type, x)); 3292 3293 /* TODO: optimize the constant case */ 3294 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3295 LLVMIsConstant(x)) { 3296 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3297 __FUNCTION__); 3298 } 3299 3300 assert(type.floating && type.width == 32); 3301 3302 /* We want to preserve NaN and make sure than for exp2 if x > 128, 3303 * the result is INF and if it's smaller than -126.9 the result is 0 */ 3304 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x, 3305 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3306 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), 3307 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3308 3309 /* ipart = floor(x) */ 3310 /* fpart = x - ipart */ 3311 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 3312 3313 /* expipart = (float) (1 << ipart) */ 3314 expipart = LLVMBuildAdd(builder, ipart, 3315 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3316 expipart = LLVMBuildShl(builder, expipart, 3317 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3318 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 3319 3320 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 3321 ARRAY_SIZE(lp_build_exp2_polynomial)); 
3322 3323 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 3324 3325 return res; 3326 } 3327 3328 3329 3330 /** 3331 * Extract the exponent of a IEEE-754 floating point value. 3332 * 3333 * Optionally apply an integer bias. 3334 * 3335 * Result is an integer value with 3336 * 3337 * ifloor(log2(x)) + bias 3338 */ 3339 LLVMValueRef 3340 lp_build_extract_exponent(struct lp_build_context *bld, 3341 LLVMValueRef x, 3342 int bias) 3343 { 3344 LLVMBuilderRef builder = bld->gallivm->builder; 3345 const struct lp_type type = bld->type; 3346 unsigned mantissa = lp_mantissa(type); 3347 LLVMValueRef res; 3348 3349 assert(type.floating); 3350 3351 assert(lp_check_value(bld->type, x)); 3352 3353 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3354 3355 res = LLVMBuildLShr(builder, x, 3356 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 3357 res = LLVMBuildAnd(builder, res, 3358 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 3359 res = LLVMBuildSub(builder, res, 3360 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 3361 3362 return res; 3363 } 3364 3365 3366 /** 3367 * Extract the mantissa of the a floating. 
3368 * 3369 * Result is a floating point value with 3370 * 3371 * x / floor(log2(x)) 3372 */ 3373 LLVMValueRef 3374 lp_build_extract_mantissa(struct lp_build_context *bld, 3375 LLVMValueRef x) 3376 { 3377 LLVMBuilderRef builder = bld->gallivm->builder; 3378 const struct lp_type type = bld->type; 3379 unsigned mantissa = lp_mantissa(type); 3380 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 3381 (1ULL << mantissa) - 1); 3382 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 3383 LLVMValueRef res; 3384 3385 assert(lp_check_value(bld->type, x)); 3386 3387 assert(type.floating); 3388 3389 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3390 3391 /* res = x / 2**ipart */ 3392 res = LLVMBuildAnd(builder, x, mantmask, ""); 3393 res = LLVMBuildOr(builder, res, one, ""); 3394 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 3395 3396 return res; 3397 } 3398 3399 3400 3401 /** 3402 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 3403 * These coefficients can be generate with 3404 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 3405 */ 3406 const double lp_build_log2_polynomial[] = { 3407 #if LOG_POLY_DEGREE == 5 3408 2.88539008148777786488L, 3409 0.961796878841293367824L, 3410 0.577058946784739859012L, 3411 0.412914355135828735411L, 3412 0.308591899232910175289L, 3413 0.352376952300281371868L, 3414 #elif LOG_POLY_DEGREE == 4 3415 2.88539009343309178325L, 3416 0.961791550404184197881L, 3417 0.577440339438736392009L, 3418 0.403343858251329912514L, 3419 0.406718052498846252698L, 3420 #elif LOG_POLY_DEGREE == 3 3421 2.88538959748872753838L, 3422 0.961932915889597772928L, 3423 0.571118517972136195241L, 3424 0.493997535084709500285L, 3425 #else 3426 #error 3427 #endif 3428 }; 3429 3430 /** 3431 * See http://www.devmaster.net/forums/showthread.php?p=43580 3432 * 
http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   /* IEEE-754 single precision exponent and mantissa field masks, and the
    * integer bit pattern of 1.0f. */
   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* Only emit IR for the parts the caller actually asked for. */
   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Shift the exponent field down and remove the IEEE bias (127) to get
       * floor(log2(x)) as a float. */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x), i.e. x rescaled into [1, 2[ */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan.  This select comes last
          * so the NaN result takes precedence over the inf/zero ones. */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
3597 * 3598 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 3599 */ 3600 LLVMValueRef 3601 lp_build_fast_log2(struct lp_build_context *bld, 3602 LLVMValueRef x) 3603 { 3604 LLVMBuilderRef builder = bld->gallivm->builder; 3605 LLVMValueRef ipart; 3606 LLVMValueRef fpart; 3607 3608 assert(lp_check_value(bld->type, x)); 3609 3610 assert(bld->type.floating); 3611 3612 /* ipart = floor(log2(x)) - 1 */ 3613 ipart = lp_build_extract_exponent(bld, x, -1); 3614 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 3615 3616 /* fpart = x / 2**ipart */ 3617 fpart = lp_build_extract_mantissa(bld, x); 3618 3619 /* ipart + fpart */ 3620 return LLVMBuildFAdd(builder, ipart, fpart, ""); 3621 } 3622 3623 3624 /** 3625 * Fast implementation of iround(log2(x)). 3626 * 3627 * Not an approximation -- it should give accurate results all the time. 3628 */ 3629 LLVMValueRef 3630 lp_build_ilog2(struct lp_build_context *bld, 3631 LLVMValueRef x) 3632 { 3633 LLVMBuilderRef builder = bld->gallivm->builder; 3634 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 3635 LLVMValueRef ipart; 3636 3637 assert(bld->type.floating); 3638 3639 assert(lp_check_value(bld->type, x)); 3640 3641 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 3642 x = LLVMBuildFMul(builder, x, sqrt2, ""); 3643 3644 /* ipart = floor(log2(x) + 0.5) */ 3645 ipart = lp_build_extract_exponent(bld, x, 0); 3646 3647 return ipart; 3648 } 3649 3650 LLVMValueRef 3651 lp_build_mod(struct lp_build_context *bld, 3652 LLVMValueRef x, 3653 LLVMValueRef y) 3654 { 3655 LLVMBuilderRef builder = bld->gallivm->builder; 3656 LLVMValueRef res; 3657 const struct lp_type type = bld->type; 3658 3659 assert(lp_check_value(type, x)); 3660 assert(lp_check_value(type, y)); 3661 3662 if (type.floating) 3663 res = LLVMBuildFRem(builder, x, y, ""); 3664 else if (type.sign) 3665 res = LLVMBuildSRem(builder, x, y, ""); 3666 else 3667 res = LLVMBuildURem(builder, x, y, ""); 3668 return res; 3669 } 3670 3671 3672 /* 
3673 * For floating inputs it creates and returns a mask 3674 * which is all 1's for channels which are NaN. 3675 * Channels inside x which are not NaN will be 0. 3676 */ 3677 LLVMValueRef 3678 lp_build_isnan(struct lp_build_context *bld, 3679 LLVMValueRef x) 3680 { 3681 LLVMValueRef mask; 3682 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3683 3684 assert(bld->type.floating); 3685 assert(lp_check_value(bld->type, x)); 3686 3687 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x, 3688 "isnotnan"); 3689 mask = LLVMBuildNot(bld->gallivm->builder, mask, ""); 3690 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan"); 3691 return mask; 3692 } 3693 3694 /* Returns all 1's for floating point numbers that are 3695 * finite numbers and returns all zeros for -inf, 3696 * inf and nan's */ 3697 LLVMValueRef 3698 lp_build_isfinite(struct lp_build_context *bld, 3699 LLVMValueRef x) 3700 { 3701 LLVMBuilderRef builder = bld->gallivm->builder; 3702 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3703 struct lp_type int_type = lp_int_type(bld->type); 3704 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3705 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type, 3706 0x7f800000); 3707 3708 if (!bld->type.floating) { 3709 return lp_build_const_int_vec(bld->gallivm, bld->type, 0); 3710 } 3711 assert(bld->type.floating); 3712 assert(lp_check_value(bld->type, x)); 3713 assert(bld->type.width == 32); 3714 3715 intx = LLVMBuildAnd(builder, intx, infornan32, ""); 3716 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL, 3717 intx, infornan32); 3718 } 3719 3720 /* 3721 * Returns true if the number is nan or inf and false otherwise. 3722 * The input has to be a floating point vector. 
3723 */ 3724 LLVMValueRef 3725 lp_build_is_inf_or_nan(struct gallivm_state *gallivm, 3726 const struct lp_type type, 3727 LLVMValueRef x) 3728 { 3729 LLVMBuilderRef builder = gallivm->builder; 3730 struct lp_type int_type = lp_int_type(type); 3731 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type, 3732 0x7f800000); 3733 LLVMValueRef ret; 3734 3735 assert(type.floating); 3736 3737 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), ""); 3738 ret = LLVMBuildAnd(builder, ret, const0, ""); 3739 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL, 3740 ret, const0); 3741 3742 return ret; 3743 } 3744 3745 3746 LLVMValueRef 3747 lp_build_fpstate_get(struct gallivm_state *gallivm) 3748 { 3749 if (util_cpu_caps.has_sse) { 3750 LLVMBuilderRef builder = gallivm->builder; 3751 LLVMValueRef mxcsr_ptr = lp_build_alloca( 3752 gallivm, 3753 LLVMInt32TypeInContext(gallivm->context), 3754 "mxcsr_ptr"); 3755 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr, 3756 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3757 lp_build_intrinsic(builder, 3758 "llvm.x86.sse.stmxcsr", 3759 LLVMVoidTypeInContext(gallivm->context), 3760 &mxcsr_ptr8, 1, 0); 3761 return mxcsr_ptr; 3762 } 3763 return 0; 3764 } 3765 3766 void 3767 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, 3768 boolean zero) 3769 { 3770 if (util_cpu_caps.has_sse) { 3771 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ 3772 int daz_ftz = _MM_FLUSH_ZERO_MASK; 3773 3774 LLVMBuilderRef builder = gallivm->builder; 3775 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm); 3776 LLVMValueRef mxcsr = 3777 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr"); 3778 3779 if (util_cpu_caps.has_daz) { 3780 /* Enable denormals are zero mode */ 3781 daz_ftz |= _MM_DENORMALS_ZERO_MASK; 3782 } 3783 if (zero) { 3784 mxcsr = LLVMBuildOr(builder, mxcsr, 3785 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), ""); 3786 } else { 3787 mxcsr = LLVMBuildAnd(builder, 
mxcsr, 3788 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), ""); 3789 } 3790 3791 LLVMBuildStore(builder, mxcsr, mxcsr_ptr); 3792 lp_build_fpstate_set(gallivm, mxcsr_ptr); 3793 } 3794 } 3795 3796 void 3797 lp_build_fpstate_set(struct gallivm_state *gallivm, 3798 LLVMValueRef mxcsr_ptr) 3799 { 3800 if (util_cpu_caps.has_sse) { 3801 LLVMBuilderRef builder = gallivm->builder; 3802 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, 3803 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3804 lp_build_intrinsic(builder, 3805 "llvm.x86.sse.ldmxcsr", 3806 LLVMVoidTypeInContext(gallivm->context), 3807 &mxcsr_ptr, 1, 0); 3808 } 3809 } 3810