1 /************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29 /** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca (at) vmware.com> 45 */ 46 47 48 #include <float.h> 49 50 #include "util/u_memory.h" 51 #include "util/u_debug.h" 52 #include "util/u_math.h" 53 #include "util/u_cpu_detect.h" 54 55 #include "lp_bld_type.h" 56 #include "lp_bld_const.h" 57 #include "lp_bld_init.h" 58 #include "lp_bld_intr.h" 59 #include "lp_bld_logic.h" 60 #include "lp_bld_pack.h" 61 #include "lp_bld_debug.h" 62 #include "lp_bld_bitarit.h" 63 #include "lp_bld_arit.h" 64 #include "lp_bld_flow.h" 65 66 #if defined(PIPE_ARCH_SSE) 67 #include <xmmintrin.h> 68 #endif 69 70 #ifndef _MM_DENORMALS_ZERO_MASK 71 #define _MM_DENORMALS_ZERO_MASK 0x0040 72 #endif 73 74 #ifndef _MM_FLUSH_ZERO_MASK 75 #define _MM_FLUSH_ZERO_MASK 0x8000 76 #endif 77 78 #define EXP_POLY_DEGREE 5 79 80 #define LOG_POLY_DEGREE 4 81 82 83 /** 84 * Generate min(a, b) 85 * No checks for special case values of a or b = 1 or 0 are done. 86 * NaN's are handled according to the behavior specified by the 87 * nan_behavior argument. 88 */ 89 static LLVMValueRef 90 lp_build_min_simple(struct lp_build_context *bld, 91 LLVMValueRef a, 92 LLVMValueRef b, 93 enum gallivm_nan_behavior nan_behavior) 94 { 95 const struct lp_type type = bld->type; 96 const char *intrinsic = NULL; 97 unsigned intr_size = 0; 98 LLVMValueRef cond; 99 100 assert(lp_check_value(type, a)); 101 assert(lp_check_value(type, b)); 102 103 /* TODO: optimize the constant case */ 104 105 if (type.floating && util_cpu_caps.has_sse) { 106 if (type.width == 32) { 107 if (type.length == 1) { 108 intrinsic = "llvm.x86.sse.min.ss"; 109 intr_size = 128; 110 } 111 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 112 intrinsic = "llvm.x86.sse.min.ps"; 113 intr_size = 128; 114 } 115 else { 116 intrinsic = "llvm.x86.avx.min.ps.256"; 117 intr_size = 256; 118 } 119 } 120 if (type.width == 64 && util_cpu_caps.has_sse2) { 121 if (type.length == 1) { 122 intrinsic = "llvm.x86.sse2.min.sd"; 123 intr_size = 128; 124 } 125 else if (type.length == 2 || !util_cpu_caps.has_avx) { 126 intrinsic = "llvm.x86.sse2.min.pd"; 127 intr_size = 128; 128 } 129 else { 130 intrinsic = "llvm.x86.avx.min.pd.256"; 131 intr_size = 256; 132 } 133 } 134 } 135 else if (type.floating && util_cpu_caps.has_altivec) { 136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN || 137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 138 debug_printf("%s: altivec doesn't support nan return nan behavior\n", 139 __FUNCTION__); 140 } 141 if (type.width == 32 && type.length == 4) { 142 intrinsic = "llvm.ppc.altivec.vminfp"; 143 intr_size = 128; 144 } 145 } else if (HAVE_LLVM < 0x0309 && 146 util_cpu_caps.has_avx2 && type.length > 4) { 147 intr_size = 256; 148 switch (type.width) { 149 case 8: 150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b"; 151 break; 152 case 16: 153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w"; 154 break; 155 case 32: 156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d"; 157 break; 158 } 159 } else if (HAVE_LLVM < 0x0309 && 160 util_cpu_caps.has_sse2 && type.length >= 2) { 161 intr_size = 128; 162 if ((type.width == 8 || type.width == 16) && 163 (type.width * type.length <= 64) && 164 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 166 __FUNCTION__); 167 } 168 if (type.width == 8 && !type.sign) { 169 intrinsic = "llvm.x86.sse2.pminu.b"; 170 } 171 else if (type.width == 16 && type.sign) { 172 intrinsic = "llvm.x86.sse2.pmins.w"; 173 } 174 if (util_cpu_caps.has_sse4_1) { 175 if (type.width == 8 && type.sign) { 176 intrinsic = "llvm.x86.sse41.pminsb"; 177 } 178 if (type.width == 16 && !type.sign) { 179 intrinsic = "llvm.x86.sse41.pminuw"; 180 } 181 if (type.width == 32 && !type.sign) { 182 intrinsic = "llvm.x86.sse41.pminud"; 183 } 184 if (type.width == 32 && type.sign) { 185 intrinsic = "llvm.x86.sse41.pminsd"; 186 } 187 } 188 } else if (util_cpu_caps.has_altivec) { 189 intr_size = 128; 190 if (type.width == 8) { 191 if (!type.sign) { 192 intrinsic = "llvm.ppc.altivec.vminub"; 193 } else { 194 intrinsic = "llvm.ppc.altivec.vminsb"; 195 } 196 } else if (type.width == 16) { 197 if (!type.sign) { 198 intrinsic = "llvm.ppc.altivec.vminuh"; 199 } else { 200 intrinsic = "llvm.ppc.altivec.vminsh"; 201 } 202 } else if (type.width == 32) { 203 if (!type.sign) { 204 intrinsic = "llvm.ppc.altivec.vminuw"; 205 } else { 206 intrinsic = "llvm.ppc.altivec.vminsw"; 207 } 208 } 209 } 210 211 if (intrinsic) { 212 /* We need to handle nan's for floating point numbers. If one of the 213 * inputs is nan the other should be returned (required by both D3D10+ 214 * and OpenCL). 215 * The sse intrinsics return the second operator in case of nan by 216 * default so we need to special code to handle those. 217 */ 218 if (util_cpu_caps.has_sse && type.floating && 219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && 220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && 221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 222 LLVMValueRef isnan, min; 223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 224 type, 225 intr_size, a, b); 226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 227 isnan = lp_build_isnan(bld, b); 228 return lp_build_select(bld, isnan, a, min); 229 } else { 230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN); 231 isnan = lp_build_isnan(bld, a); 232 return lp_build_select(bld, isnan, a, min); 233 } 234 } else { 235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 236 type, 237 intr_size, a, b); 238 } 239 } 240 241 if (type.floating) { 242 switch (nan_behavior) { 243 case GALLIVM_NAN_RETURN_NAN: { 244 LLVMValueRef isnan = lp_build_isnan(bld, b); 245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 247 return lp_build_select(bld, cond, a, b); 248 } 249 break; 250 case GALLIVM_NAN_RETURN_OTHER: { 251 LLVMValueRef isnan = lp_build_isnan(bld, a); 252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 254 return lp_build_select(bld, cond, a, b); 255 } 256 break; 257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b); 259 return lp_build_select(bld, cond, a, b); 260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a); 262 return lp_build_select(bld, cond, b, a); 263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 265 return lp_build_select(bld, cond, a, b); 266 break; 267 default: 268 assert(0); 269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 270 return lp_build_select(bld, cond, a, b); 271 } 272 } else { 273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 274 return lp_build_select(bld, cond, a, b); 275 } 276 } 277 278 279 LLVMValueRef 280 lp_build_fmuladd(LLVMBuilderRef builder, 281 LLVMValueRef a, 282 LLVMValueRef b, 283 LLVMValueRef c) 284 { 285 LLVMTypeRef type = LLVMTypeOf(a); 286 assert(type == LLVMTypeOf(b)); 287 assert(type == LLVMTypeOf(c)); 288 if (HAVE_LLVM < 0x0304) { 289 /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is 290 * not supported, and instead it falls-back to a C function. 291 */ 292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, ""); 293 } 294 char intrinsic[32]; 295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type); 296 LLVMValueRef args[] = { a, b, c }; 297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0); 298 } 299 300 301 /** 302 * Generate max(a, b) 303 * No checks for special case values of a or b = 1 or 0 are done. 304 * NaN's are handled according to the behavior specified by the 305 * nan_behavior argument. 306 */ 307 static LLVMValueRef 308 lp_build_max_simple(struct lp_build_context *bld, 309 LLVMValueRef a, 310 LLVMValueRef b, 311 enum gallivm_nan_behavior nan_behavior) 312 { 313 const struct lp_type type = bld->type; 314 const char *intrinsic = NULL; 315 unsigned intr_size = 0; 316 LLVMValueRef cond; 317 318 assert(lp_check_value(type, a)); 319 assert(lp_check_value(type, b)); 320 321 /* TODO: optimize the constant case */ 322 323 if (type.floating && util_cpu_caps.has_sse) { 324 if (type.width == 32) { 325 if (type.length == 1) { 326 intrinsic = "llvm.x86.sse.max.ss"; 327 intr_size = 128; 328 } 329 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 330 intrinsic = "llvm.x86.sse.max.ps"; 331 intr_size = 128; 332 } 333 else { 334 intrinsic = "llvm.x86.avx.max.ps.256"; 335 intr_size = 256; 336 } 337 } 338 if (type.width == 64 && util_cpu_caps.has_sse2) { 339 if (type.length == 1) { 340 intrinsic = "llvm.x86.sse2.max.sd"; 341 intr_size = 128; 342 } 343 else if (type.length == 2 || !util_cpu_caps.has_avx) { 344 intrinsic = "llvm.x86.sse2.max.pd"; 345 intr_size = 128; 346 } 347 else { 348 intrinsic = "llvm.x86.avx.max.pd.256"; 349 intr_size = 256; 350 } 351 } 352 } 353 else if (type.floating && util_cpu_caps.has_altivec) { 354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN || 355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 356 debug_printf("%s: altivec doesn't support nan return nan behavior\n", 357 __FUNCTION__); 358 } 359 if (type.width == 32 || type.length == 4) { 360 intrinsic = "llvm.ppc.altivec.vmaxfp"; 361 intr_size = 128; 362 } 363 } else if (HAVE_LLVM < 0x0309 && 364 util_cpu_caps.has_avx2 && type.length > 4) { 365 intr_size = 256; 366 switch (type.width) { 367 case 8: 368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b"; 369 break; 370 case 16: 371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w"; 372 break; 373 case 32: 374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d"; 375 break; 376 } 377 } else if (HAVE_LLVM < 0x0309 && 378 util_cpu_caps.has_sse2 && type.length >= 2) { 379 intr_size = 128; 380 if ((type.width == 8 || type.width == 16) && 381 (type.width * type.length <= 64) && 382 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 384 __FUNCTION__); 385 } 386 if (type.width == 8 && !type.sign) { 387 intrinsic = "llvm.x86.sse2.pmaxu.b"; 388 intr_size = 128; 389 } 390 else if (type.width == 16 && type.sign) { 391 intrinsic = "llvm.x86.sse2.pmaxs.w"; 392 } 393 if (util_cpu_caps.has_sse4_1) { 394 if (type.width == 8 && type.sign) { 395 intrinsic = "llvm.x86.sse41.pmaxsb"; 396 } 397 if (type.width == 16 && !type.sign) { 398 intrinsic = "llvm.x86.sse41.pmaxuw"; 399 } 400 if (type.width == 32 && !type.sign) { 401 intrinsic = "llvm.x86.sse41.pmaxud"; 402 } 403 if (type.width == 32 && type.sign) { 404 intrinsic = "llvm.x86.sse41.pmaxsd"; 405 } 406 } 407 } else if (util_cpu_caps.has_altivec) { 408 intr_size = 128; 409 if (type.width == 8) { 410 if (!type.sign) { 411 intrinsic = "llvm.ppc.altivec.vmaxub"; 412 } else { 413 intrinsic = "llvm.ppc.altivec.vmaxsb"; 414 } 415 } else if (type.width == 16) { 416 if (!type.sign) { 417 intrinsic = "llvm.ppc.altivec.vmaxuh"; 418 } else { 419 intrinsic = "llvm.ppc.altivec.vmaxsh"; 420 } 421 } else if (type.width == 32) { 422 if (!type.sign) { 423 intrinsic = "llvm.ppc.altivec.vmaxuw"; 424 } else { 425 intrinsic = "llvm.ppc.altivec.vmaxsw"; 426 } 427 } 428 } 429 430 if (intrinsic) { 431 if (util_cpu_caps.has_sse && type.floating && 432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && 433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && 434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 435 LLVMValueRef isnan, max; 436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 437 type, 438 intr_size, a, b); 439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 440 isnan = lp_build_isnan(bld, b); 441 return lp_build_select(bld, isnan, a, max); 442 } else { 443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN); 444 isnan = lp_build_isnan(bld, a); 445 return lp_build_select(bld, isnan, a, max); 446 } 447 } else { 448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 449 type, 450 intr_size, a, b); 451 } 452 } 453 454 if (type.floating) { 455 switch (nan_behavior) { 456 case GALLIVM_NAN_RETURN_NAN: { 457 LLVMValueRef isnan = lp_build_isnan(bld, b); 458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 460 return lp_build_select(bld, cond, a, b); 461 } 462 break; 463 case GALLIVM_NAN_RETURN_OTHER: { 464 LLVMValueRef isnan = lp_build_isnan(bld, a); 465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 467 return lp_build_select(bld, cond, a, b); 468 } 469 break; 470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b); 472 return lp_build_select(bld, cond, a, b); 473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a); 475 return lp_build_select(bld, cond, b, a); 476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 478 return lp_build_select(bld, cond, a, b); 479 break; 480 default: 481 assert(0); 482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 483 return lp_build_select(bld, cond, a, b); 484 } 485 } else { 486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 487 return lp_build_select(bld, cond, a, b); 488 } 489 } 490 491 492 /** 493 * Generate 1 - a, or ~a depending on bld->type. 494 */ 495 LLVMValueRef 496 lp_build_comp(struct lp_build_context *bld, 497 LLVMValueRef a) 498 { 499 LLVMBuilderRef builder = bld->gallivm->builder; 500 const struct lp_type type = bld->type; 501 502 assert(lp_check_value(type, a)); 503 504 if(a == bld->one) 505 return bld->zero; 506 if(a == bld->zero) 507 return bld->one; 508 509 if(type.norm && !type.floating && !type.fixed && !type.sign) { 510 if(LLVMIsConstant(a)) 511 return LLVMConstNot(a); 512 else 513 return LLVMBuildNot(builder, a, ""); 514 } 515 516 if(LLVMIsConstant(a)) 517 if (type.floating) 518 return LLVMConstFSub(bld->one, a); 519 else 520 return LLVMConstSub(bld->one, a); 521 else 522 if (type.floating) 523 return LLVMBuildFSub(builder, bld->one, a, ""); 524 else 525 return LLVMBuildSub(builder, bld->one, a, ""); 526 } 527 528 529 /** 530 * Generate a + b 531 */ 532 LLVMValueRef 533 lp_build_add(struct lp_build_context *bld, 534 LLVMValueRef a, 535 LLVMValueRef b) 536 { 537 LLVMBuilderRef builder = bld->gallivm->builder; 538 const struct lp_type type = bld->type; 539 LLVMValueRef res; 540 541 assert(lp_check_value(type, a)); 542 assert(lp_check_value(type, b)); 543 544 if(a == bld->zero) 545 return b; 546 if(b == bld->zero) 547 return a; 548 if(a == bld->undef || b == bld->undef) 549 return bld->undef; 550 551 if(bld->type.norm) { 552 const char *intrinsic = NULL; 553 554 if(a == bld->one || b == bld->one) 555 return bld->one; 556 557 if (!type.floating && !type.fixed) { 558 if (type.width * type.length == 128) { 559 if(util_cpu_caps.has_sse2) { 560 if(type.width == 8) 561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 562 if(type.width == 16) 563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 564 } else if (util_cpu_caps.has_altivec) { 565 if(type.width == 8) 566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; 567 if(type.width == 16) 568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; 569 } 570 } 571 if (type.width * type.length == 256) { 572 if(util_cpu_caps.has_avx2) { 573 if(type.width == 8) 574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; 575 if(type.width == 16) 576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; 577 } 578 } 579 } 580 581 if (intrinsic) 582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 583 } 584 585 if(type.norm && !type.floating && !type.fixed) { 586 if (type.sign) { 587 uint64_t sign = (uint64_t)1 << (type.width - 1); 588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 590 /* a_clamp_max is the maximum a for positive b, 591 a_clamp_min is the minimum a for negative b. */ 592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min); 595 } else { 596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 597 } 598 } 599 600 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 601 if (type.floating) 602 res = LLVMConstFAdd(a, b); 603 else 604 res = LLVMConstAdd(a, b); 605 else 606 if (type.floating) 607 res = LLVMBuildFAdd(builder, a, b, ""); 608 else 609 res = LLVMBuildAdd(builder, a, b, ""); 610 611 /* clamp to ceiling of 1.0 */ 612 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 614 615 /* XXX clamp to floor of -1 or 0??? */ 616 617 return res; 618 } 619 620 621 /** Return the scalar sum of the elements of a. 622 * Should avoid this operation whenever possible. 623 */ 624 LLVMValueRef 625 lp_build_horizontal_add(struct lp_build_context *bld, 626 LLVMValueRef a) 627 { 628 LLVMBuilderRef builder = bld->gallivm->builder; 629 const struct lp_type type = bld->type; 630 LLVMValueRef index, res; 631 unsigned i, length; 632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 634 LLVMValueRef vecres, elem2; 635 636 assert(lp_check_value(type, a)); 637 638 if (type.length == 1) { 639 return a; 640 } 641 642 assert(!bld->type.norm); 643 644 /* 645 * for byte vectors can do much better with psadbw. 646 * Using repeated shuffle/adds here. Note with multiple vectors 647 * this can be done more efficiently as outlined in the intel 648 * optimization manual. 649 * Note: could cause data rearrangement if used with smaller element 650 * sizes. 651 */ 652 653 vecres = a; 654 length = type.length / 2; 655 while (length > 1) { 656 LLVMValueRef vec1, vec2; 657 for (i = 0; i < length; i++) { 658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 660 } 661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 662 LLVMConstVector(shuffles1, length), ""); 663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 664 LLVMConstVector(shuffles2, length), ""); 665 if (type.floating) { 666 vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 667 } 668 else { 669 vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 670 } 671 length = length >> 1; 672 } 673 674 /* always have vector of size 2 here */ 675 assert(length == 1); 676 677 index = lp_build_const_int32(bld->gallivm, 0); 678 res = LLVMBuildExtractElement(builder, vecres, index, ""); 679 index = lp_build_const_int32(bld->gallivm, 1); 680 elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 681 682 if (type.floating) 683 res = LLVMBuildFAdd(builder, res, elem2, ""); 684 else 685 res = LLVMBuildAdd(builder, res, elem2, ""); 686 687 return res; 688 } 689 690 /** 691 * Return the horizontal sums of 4 float vectors as a float4 vector. 692 * This uses the technique as outlined in Intel Optimization Manual. 693 */ 694 static LLVMValueRef 695 lp_build_horizontal_add4x4f(struct lp_build_context *bld, 696 LLVMValueRef src[4]) 697 { 698 struct gallivm_state *gallivm = bld->gallivm; 699 LLVMBuilderRef builder = gallivm->builder; 700 LLVMValueRef shuffles[4]; 701 LLVMValueRef tmp[4]; 702 LLVMValueRef sumtmp[2], shuftmp[2]; 703 704 /* lower half of regs */ 705 shuffles[0] = lp_build_const_int32(gallivm, 0); 706 shuffles[1] = lp_build_const_int32(gallivm, 1); 707 shuffles[2] = lp_build_const_int32(gallivm, 4); 708 shuffles[3] = lp_build_const_int32(gallivm, 5); 709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 710 LLVMConstVector(shuffles, 4), ""); 711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 712 LLVMConstVector(shuffles, 4), ""); 713 714 /* upper half of regs */ 715 shuffles[0] = lp_build_const_int32(gallivm, 2); 716 shuffles[1] = lp_build_const_int32(gallivm, 3); 717 shuffles[2] = lp_build_const_int32(gallivm, 6); 718 shuffles[3] = lp_build_const_int32(gallivm, 7); 719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 720 LLVMConstVector(shuffles, 4), ""); 721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 722 LLVMConstVector(shuffles, 4), ""); 723 724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 726 727 shuffles[0] = lp_build_const_int32(gallivm, 0); 728 shuffles[1] = lp_build_const_int32(gallivm, 2); 729 shuffles[2] = lp_build_const_int32(gallivm, 4); 730 shuffles[3] = lp_build_const_int32(gallivm, 6); 731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 732 LLVMConstVector(shuffles, 4), ""); 733 734 shuffles[0] = lp_build_const_int32(gallivm, 1); 735 shuffles[1] = lp_build_const_int32(gallivm, 3); 736 shuffles[2] = lp_build_const_int32(gallivm, 5); 737 shuffles[3] = lp_build_const_int32(gallivm, 7); 738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 739 LLVMConstVector(shuffles, 4), ""); 740 741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 742 } 743 744 745 /* 746 * partially horizontally add 2-4 float vectors with length nx4, 747 * i.e. only four adjacent values in each vector will be added, 748 * assuming values are really grouped in 4 which also determines 749 * output order. 750 * 751 * Return a vector of the same length as the initial vectors, 752 * with the excess elements (if any) being undefined. 753 * The element order is independent of number of input vectors. 754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 755 * the output order thus will be 756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef 757 */ 758 LLVMValueRef 759 lp_build_hadd_partial4(struct lp_build_context *bld, 760 LLVMValueRef vectors[], 761 unsigned num_vecs) 762 { 763 struct gallivm_state *gallivm = bld->gallivm; 764 LLVMBuilderRef builder = gallivm->builder; 765 LLVMValueRef ret_vec; 766 LLVMValueRef tmp[4]; 767 const char *intrinsic = NULL; 768 769 assert(num_vecs >= 2 && num_vecs <= 4); 770 assert(bld->type.floating); 771 772 /* only use this with at least 2 vectors, as it is sort of expensive 773 * (depending on cpu) and we always need two horizontal adds anyway, 774 * so a shuffle/add approach might be better. 775 */ 776 777 tmp[0] = vectors[0]; 778 tmp[1] = vectors[1]; 779 780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; 781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; 782 783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 && 784 bld->type.length == 4) { 785 intrinsic = "llvm.x86.sse3.hadd.ps"; 786 } 787 else if (util_cpu_caps.has_avx && bld->type.width == 32 && 788 bld->type.length == 8) { 789 intrinsic = "llvm.x86.avx.hadd.ps.256"; 790 } 791 if (intrinsic) { 792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, 793 lp_build_vec_type(gallivm, bld->type), 794 tmp[0], tmp[1]); 795 if (num_vecs > 2) { 796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, 797 lp_build_vec_type(gallivm, bld->type), 798 tmp[2], tmp[3]); 799 } 800 else { 801 tmp[1] = tmp[0]; 802 } 803 return lp_build_intrinsic_binary(builder, intrinsic, 804 lp_build_vec_type(gallivm, bld->type), 805 tmp[0], tmp[1]); 806 } 807 808 if (bld->type.length == 4) { 809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp); 810 } 811 else { 812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; 813 unsigned j; 814 unsigned num_iter = bld->type.length / 4; 815 struct lp_type parttype = bld->type; 816 parttype.length = 4; 817 for (j = 0; j < num_iter; j++) { 818 LLVMValueRef partsrc[4]; 819 unsigned i; 820 for (i = 0; i < 4; i++) { 821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); 822 } 823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); 824 } 825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); 826 } 827 return ret_vec; 828 } 829 830 /** 831 * Generate a - b 832 */ 833 LLVMValueRef 834 lp_build_sub(struct lp_build_context *bld, 835 LLVMValueRef a, 836 LLVMValueRef b) 837 { 838 LLVMBuilderRef builder = bld->gallivm->builder; 839 const struct lp_type type = bld->type; 840 LLVMValueRef res; 841 842 assert(lp_check_value(type, a)); 843 assert(lp_check_value(type, b)); 844 845 if(b == bld->zero) 846 return a; 847 if(a == bld->undef || b == bld->undef) 848 return bld->undef; 849 if(a == b) 850 return bld->zero; 851 852 if(bld->type.norm) { 853 const char *intrinsic = NULL; 854 855 if(b == bld->one) 856 return bld->zero; 857 858 if (!type.floating && !type.fixed) { 859 if (type.width * type.length == 128) { 860 if (util_cpu_caps.has_sse2) { 861 if(type.width == 8) 862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 863 if(type.width == 16) 864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 865 } else if (util_cpu_caps.has_altivec) { 866 if(type.width == 8) 867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; 868 if(type.width == 16) 869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; 870 } 871 } 872 if (type.width * type.length == 256) { 873 if (util_cpu_caps.has_avx2) { 874 if(type.width == 8) 875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; 876 if(type.width == 16) 877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; 878 } 879 } 880 } 881 882 if (intrinsic) 883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 884 } 885 886 if(type.norm && !type.floating && !type.fixed) { 887 if (type.sign) { 888 uint64_t sign = (uint64_t)1 << (type.width - 1); 889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 891 /* a_clamp_max is the maximum a for negative b, 892 a_clamp_min is the minimum a for positive b. */ 893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max); 896 } else { 897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 898 } 899 } 900 901 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 902 if (type.floating) 903 res = LLVMConstFSub(a, b); 904 else 905 res = LLVMConstSub(a, b); 906 else 907 if (type.floating) 908 res = LLVMBuildFSub(builder, a, b, ""); 909 else 910 res = LLVMBuildSub(builder, a, b, ""); 911 912 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 914 915 return res; 916 } 917 918 919 920 /** 921 * Normalized multiplication. 922 * 923 * There are several approaches for (using 8-bit normalized multiplication as 924 * an example): 925 * 926 * - alpha plus one 927 * 928 * makes the following approximation to the division (Sree) 929 * 930 * a*b/255 ~= (a*(b + 1)) >> 256 931 * 932 * which is the fastest method that satisfies the following OpenGL criteria of 933 * 934 * 0*0 = 0 and 255*255 = 255 935 * 936 * - geometric series 937 * 938 * takes the geometric series approximation to the division 939 * 940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 941 * 942 * in this case just the first two terms to fit in 16bit arithmetic 943 * 944 * t/255 ~= (t + (t >> 8)) >> 8 945 * 946 * note that just by itself it doesn't satisfies the OpenGL criteria, as 947 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 948 * must be used. 949 * 950 * - geometric series plus rounding 951 * 952 * when using a geometric series division instead of truncating the result 953 * use roundoff in the approximation (Jim Blinn) 954 * 955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 956 * 957 * achieving the exact results. 958 * 959 * 960 * 961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 963 * @sa Michael Herf, The "double blend trick", May 2000, 964 * http://www.stereopsis.com/doubleblend.html 965 */ 966 static LLVMValueRef 967 lp_build_mul_norm(struct gallivm_state *gallivm, 968 struct lp_type wide_type, 969 LLVMValueRef a, LLVMValueRef b) 970 { 971 LLVMBuilderRef builder = gallivm->builder; 972 struct lp_build_context bld; 973 unsigned n; 974 LLVMValueRef half; 975 LLVMValueRef ab; 976 977 assert(!wide_type.floating); 978 assert(lp_check_value(wide_type, a)); 979 assert(lp_check_value(wide_type, b)); 980 981 lp_build_context_init(&bld, gallivm, wide_type); 982 983 n = wide_type.width / 2; 984 if (wide_type.sign) { 985 --n; 986 } 987 988 /* 989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW 990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/ 991 */ 992 993 /* 994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n 995 */ 996 997 ab = LLVMBuildMul(builder, a, b, ""); 998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), ""); 999 1000 /* 1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1)) 1002 */ 1003 1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1)); 1005 if (wide_type.sign) { 1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, ""); 1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1); 1008 half = lp_build_select(&bld, sign, minus_half, half); 1009 } 1010 ab = LLVMBuildAdd(builder, ab, half, ""); 1011 1012 /* Final division */ 1013 ab = lp_build_shr_imm(&bld, ab, n); 1014 1015 return ab; 1016 } 1017 1018 /** 1019 * Generate a * b 1020 */ 1021 LLVMValueRef 1022 lp_build_mul(struct lp_build_context *bld, 1023 LLVMValueRef a, 1024 LLVMValueRef b) 1025 { 1026 LLVMBuilderRef builder = bld->gallivm->builder; 1027 const struct lp_type type = bld->type; 1028 LLVMValueRef shift; 1029 LLVMValueRef res; 1030 1031 assert(lp_check_value(type, a)); 1032 assert(lp_check_value(type, b)); 1033 1034 if(a == bld->zero) 1035 return bld->zero; 1036 if(a == bld->one) 1037 return b; 1038 if(b == bld->zero) 1039 return bld->zero; 1040 if(b == bld->one) 1041 return a; 1042 if(a == bld->undef || b == bld->undef) 1043 return bld->undef; 1044 1045 if (!type.floating && !type.fixed && type.norm) { 1046 struct lp_type wide_type = lp_wider_type(type); 1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 1048 1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); 1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); 1051 1052 /* PMULLW, PSRLW, PADDW */ 1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); 1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); 1055 1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); 1057 1058 return ab; 1059 } 1060 1061 if(type.fixed) 1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 1063 else 1064 shift = NULL; 1065 1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1067 if (type.floating) 1068 res = LLVMConstFMul(a, b); 1069 else 1070 res = LLVMConstMul(a, b); 1071 if(shift) { 1072 if(type.sign) 1073 res = LLVMConstAShr(res, shift); 1074 else 1075 res = LLVMConstLShr(res, shift); 1076 } 1077 } 1078 else { 1079 if (type.floating) 1080 res = LLVMBuildFMul(builder, a, b, ""); 1081 else 1082 res = LLVMBuildMul(builder, a, b, ""); 1083 if(shift) { 1084 if(type.sign) 1085 res = LLVMBuildAShr(builder, res, shift, ""); 1086 else 1087 res = LLVMBuildLShr(builder, res, shift, ""); 1088 } 1089 } 1090 1091 return res; 1092 } 1093 1094 /* 1095 * Widening mul, valid for 32x32 bit -> 64bit only. 1096 * Result is low 32bits, high bits returned in res_hi. 1097 * 1098 * Emits code that is meant to be compiled for the host CPU. 1099 */ 1100 LLVMValueRef 1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, 1102 LLVMValueRef a, 1103 LLVMValueRef b, 1104 LLVMValueRef *res_hi) 1105 { 1106 struct gallivm_state *gallivm = bld->gallivm; 1107 LLVMBuilderRef builder = gallivm->builder; 1108 1109 assert(bld->type.width == 32); 1110 assert(bld->type.floating == 0); 1111 assert(bld->type.fixed == 0); 1112 assert(bld->type.norm == 0); 1113 1114 /* 1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces 1116 * for x86 simd is atrocious (even if the high bits weren't required), 1117 * trying to handle real 64bit inputs (which of course can't happen due 1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but 1119 * apparently llvm does not recognize this widening mul). This includes 6 1120 * (instead of 2) pmuludq plus extra adds and shifts 1121 * The same story applies to signed mul, albeit fixing this requires sse41. 1122 * https://llvm.org/bugs/show_bug.cgi?id=30845 1123 * So, whip up our own code, albeit only for length 4 and 8 (which 1124 * should be good enough)... 1125 */ 1126 if ((bld->type.length == 4 || bld->type.length == 8) && 1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || 1128 util_cpu_caps.has_sse4_1)) { 1129 const char *intrinsic = NULL; 1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; 1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; 1132 struct lp_type type_wide = lp_wider_type(bld->type); 1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); 1134 unsigned i; 1135 for (i = 0; i < bld->type.length; i += 2) { 1136 shuf[i] = lp_build_const_int32(gallivm, i+1); 1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 1138 } 1139 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1140 aeven = a; 1141 beven = b; 1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); 1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); 1144 1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) { 1146 if (bld->type.sign) { 1147 intrinsic = "llvm.x86.avx2.pmul.dq"; 1148 } else { 1149 intrinsic = "llvm.x86.avx2.pmulu.dq"; 1150 } 1151 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1152 wider_type, aeven, beven); 1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1154 wider_type, aodd, bodd); 1155 } 1156 else { 1157 /* for consistent naming look elsewhere... */ 1158 if (bld->type.sign) { 1159 intrinsic = "llvm.x86.sse41.pmuldq"; 1160 } else { 1161 intrinsic = "llvm.x86.sse2.pmulu.dq"; 1162 } 1163 /* 1164 * XXX If we only have AVX but not AVX2 this is a pain. 1165 * lp_build_intrinsic_binary_anylength() can't handle it 1166 * (due to src and dst type not being identical). 1167 */ 1168 if (bld->type.length == 8) { 1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; 1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; 1171 LLVMValueRef muleven2[2], mulodd2[2]; 1172 struct lp_type type_wide_half = type_wide; 1173 LLVMTypeRef wtype_half; 1174 type_wide_half.length = 2; 1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half); 1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); 1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); 1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); 1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); 1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); 1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); 1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); 1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); 1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1185 wtype_half, aevenlo, bevenlo); 1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1187 wtype_half, aoddlo, boddlo); 1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1189 wtype_half, aevenhi, bevenhi); 1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1191 wtype_half, aoddhi, boddhi); 1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); 1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); 1194 1195 } 1196 else { 1197 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1198 wider_type, aeven, beven); 1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1200 wider_type, aodd, bodd); 1201 } 1202 } 1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); 1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); 1205 1206 for (i = 0; i < bld->type.length; i += 2) { 1207 shuf[i] = lp_build_const_int32(gallivm, i + 1); 1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length); 1209 } 1210 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1212 1213 for (i = 0; i < bld->type.length; i += 2) { 1214 shuf[i] = lp_build_const_int32(gallivm, i); 1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); 1216 } 1217 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1219 } 1220 else { 1221 return lp_build_mul_32_lohi(bld, a, b, res_hi); 1222 } 1223 } 1224 1225 1226 /* 1227 * Widening mul, valid for 32x32 bit -> 64bit only. 1228 * Result is low 32bits, high bits returned in res_hi. 1229 * 1230 * Emits generic code. 1231 */ 1232 LLVMValueRef 1233 lp_build_mul_32_lohi(struct lp_build_context *bld, 1234 LLVMValueRef a, 1235 LLVMValueRef b, 1236 LLVMValueRef *res_hi) 1237 { 1238 struct gallivm_state *gallivm = bld->gallivm; 1239 LLVMBuilderRef builder = gallivm->builder; 1240 LLVMValueRef tmp, shift, res_lo; 1241 struct lp_type type_tmp; 1242 LLVMTypeRef wide_type, narrow_type; 1243 1244 type_tmp = bld->type; 1245 narrow_type = lp_build_vec_type(gallivm, type_tmp); 1246 type_tmp.width *= 2; 1247 wide_type = lp_build_vec_type(gallivm, type_tmp); 1248 shift = lp_build_const_vec(gallivm, type_tmp, 32); 1249 1250 if (bld->type.sign) { 1251 a = LLVMBuildSExt(builder, a, wide_type, ""); 1252 b = LLVMBuildSExt(builder, b, wide_type, ""); 1253 } else { 1254 a = LLVMBuildZExt(builder, a, wide_type, ""); 1255 b = LLVMBuildZExt(builder, b, wide_type, ""); 1256 } 1257 tmp = LLVMBuildMul(builder, a, b, ""); 1258 1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1260 1261 /* Since we truncate anyway, LShr and AShr are equivalent. */ 1262 tmp = LLVMBuildLShr(builder, tmp, shift, ""); 1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1264 1265 return res_lo; 1266 } 1267 1268 1269 /* a * b + c */ 1270 LLVMValueRef 1271 lp_build_mad(struct lp_build_context *bld, 1272 LLVMValueRef a, 1273 LLVMValueRef b, 1274 LLVMValueRef c) 1275 { 1276 const struct lp_type type = bld->type; 1277 if (type.floating) { 1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c); 1279 } else { 1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c); 1281 } 1282 } 1283 1284 1285 /** 1286 * Small vector x scale multiplication optimization. 1287 */ 1288 LLVMValueRef 1289 lp_build_mul_imm(struct lp_build_context *bld, 1290 LLVMValueRef a, 1291 int b) 1292 { 1293 LLVMBuilderRef builder = bld->gallivm->builder; 1294 LLVMValueRef factor; 1295 1296 assert(lp_check_value(bld->type, a)); 1297 1298 if(b == 0) 1299 return bld->zero; 1300 1301 if(b == 1) 1302 return a; 1303 1304 if(b == -1) 1305 return lp_build_negate(bld, a); 1306 1307 if(b == 2 && bld->type.floating) 1308 return lp_build_add(bld, a, a); 1309 1310 if(util_is_power_of_two(b)) { 1311 unsigned shift = ffs(b) - 1; 1312 1313 if(bld->type.floating) { 1314 #if 0 1315 /* 1316 * Power of two multiplication by directly manipulating the exponent. 1317 * 1318 * XXX: This might not be always faster, it will introduce a small error 1319 * for multiplication by zero, and it will produce wrong results 1320 * for Inf and NaN. 1321 */ 1322 unsigned mantissa = lp_mantissa(bld->type); 1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 1325 a = LLVMBuildAdd(builder, a, factor, ""); 1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 1327 return a; 1328 #endif 1329 } 1330 else { 1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 1332 return LLVMBuildShl(builder, a, factor, ""); 1333 } 1334 } 1335 1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 1337 return lp_build_mul(bld, a, factor); 1338 } 1339 1340 1341 /** 1342 * Generate a / b 1343 */ 1344 LLVMValueRef 1345 lp_build_div(struct lp_build_context *bld, 1346 LLVMValueRef a, 1347 LLVMValueRef b) 1348 { 1349 LLVMBuilderRef builder = bld->gallivm->builder; 1350 const struct lp_type type = bld->type; 1351 1352 assert(lp_check_value(type, a)); 1353 assert(lp_check_value(type, b)); 1354 1355 if(a == bld->zero) 1356 return bld->zero; 1357 if(a == bld->one && type.floating) 1358 return lp_build_rcp(bld, b); 1359 if(b == bld->zero) 1360 return bld->undef; 1361 if(b == bld->one) 1362 return a; 1363 if(a == bld->undef || b == bld->undef) 1364 return bld->undef; 1365 1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1367 if (type.floating) 1368 return LLVMConstFDiv(a, b); 1369 else if (type.sign) 1370 return LLVMConstSDiv(a, b); 1371 else 1372 return LLVMConstUDiv(a, b); 1373 } 1374 1375 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1376 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && 1377 type.floating) 1378 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 1379 1380 if (type.floating) 1381 return LLVMBuildFDiv(builder, a, b, ""); 1382 else if (type.sign) 1383 return LLVMBuildSDiv(builder, a, b, ""); 1384 else 1385 return LLVMBuildUDiv(builder, a, b, ""); 1386 } 1387 1388 1389 /** 1390 * Linear interpolation helper. 1391 * 1392 * @param normalized whether we are interpolating normalized values, 1393 * encoded in normalized integers, twice as wide. 1394 * 1395 * @sa http://www.stereopsis.com/doubleblend.html 1396 */ 1397 static inline LLVMValueRef 1398 lp_build_lerp_simple(struct lp_build_context *bld, 1399 LLVMValueRef x, 1400 LLVMValueRef v0, 1401 LLVMValueRef v1, 1402 unsigned flags) 1403 { 1404 unsigned half_width = bld->type.width/2; 1405 LLVMBuilderRef builder = bld->gallivm->builder; 1406 LLVMValueRef delta; 1407 LLVMValueRef res; 1408 1409 assert(lp_check_value(bld->type, x)); 1410 assert(lp_check_value(bld->type, v0)); 1411 assert(lp_check_value(bld->type, v1)); 1412 1413 delta = lp_build_sub(bld, v1, v0); 1414 1415 if (bld->type.floating) { 1416 assert(flags == 0); 1417 return lp_build_mad(bld, x, delta, v0); 1418 } 1419 1420 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) { 1421 if (!bld->type.sign) { 1422 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) { 1423 /* 1424 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the 1425 * most-significant-bit to the lowest-significant-bit, so that 1426 * later we can just divide by 2**n instead of 2**n - 1. 1427 */ 1428 1429 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); 1430 } 1431 1432 /* (x * delta) >> n */ 1433 res = lp_build_mul(bld, x, delta); 1434 res = lp_build_shr_imm(bld, res, half_width); 1435 } else { 1436 /* 1437 * The rescaling trick above doesn't work for signed numbers, so 1438 * use the 2**n - 1 divison approximation in lp_build_mul_norm 1439 * instead. 1440 */ 1441 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1442 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); 1443 } 1444 } else { 1445 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1446 res = lp_build_mul(bld, x, delta); 1447 } 1448 1449 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) { 1450 /* 1451 * At this point both res and v0 only use the lower half of the bits, 1452 * the rest is zero. Instead of add / mask, do add with half wide type. 1453 */ 1454 struct lp_type narrow_type; 1455 struct lp_build_context narrow_bld; 1456 1457 memset(&narrow_type, 0, sizeof narrow_type); 1458 narrow_type.sign = bld->type.sign; 1459 narrow_type.width = bld->type.width/2; 1460 narrow_type.length = bld->type.length*2; 1461 1462 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type); 1463 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, ""); 1464 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, ""); 1465 res = lp_build_add(&narrow_bld, v0, res); 1466 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 1467 } else { 1468 res = lp_build_add(bld, v0, res); 1469 1470 if (bld->type.fixed) { 1471 /* 1472 * We need to mask out the high order bits when lerping 8bit 1473 * normalized colors stored on 16bits 1474 */ 1475 /* XXX: This step is necessary for lerping 8bit colors stored on 1476 * 16bits, but it will be wrong for true fixed point use cases. 1477 * Basically we need a more powerful lp_type, capable of further 1478 * distinguishing the values interpretation from the value storage. 1479 */ 1480 LLVMValueRef low_bits; 1481 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1); 1482 res = LLVMBuildAnd(builder, res, low_bits, ""); 1483 } 1484 } 1485 1486 return res; 1487 } 1488 1489 1490 /** 1491 * Linear interpolation. 1492 */ 1493 LLVMValueRef 1494 lp_build_lerp(struct lp_build_context *bld, 1495 LLVMValueRef x, 1496 LLVMValueRef v0, 1497 LLVMValueRef v1, 1498 unsigned flags) 1499 { 1500 const struct lp_type type = bld->type; 1501 LLVMValueRef res; 1502 1503 assert(lp_check_value(type, x)); 1504 assert(lp_check_value(type, v0)); 1505 assert(lp_check_value(type, v1)); 1506 1507 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED)); 1508 1509 if (type.norm) { 1510 struct lp_type wide_type; 1511 struct lp_build_context wide_bld; 1512 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 1513 1514 assert(type.length >= 2); 1515 1516 /* 1517 * Create a wider integer type, enough to hold the 1518 * intermediate result of the multiplication. 1519 */ 1520 memset(&wide_type, 0, sizeof wide_type); 1521 wide_type.sign = type.sign; 1522 wide_type.width = type.width*2; 1523 wide_type.length = type.length/2; 1524 1525 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 1526 1527 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); 1528 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 1530 1531 /* 1532 * Lerp both halves. 1533 */ 1534 1535 flags |= LP_BLD_LERP_WIDE_NORMALIZED; 1536 1537 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); 1538 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); 1539 1540 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); 1541 } else { 1542 res = lp_build_lerp_simple(bld, x, v0, v1, flags); 1543 } 1544 1545 return res; 1546 } 1547 1548 1549 /** 1550 * Bilinear interpolation. 1551 * 1552 * Values indices are in v_{yx}. 1553 */ 1554 LLVMValueRef 1555 lp_build_lerp_2d(struct lp_build_context *bld, 1556 LLVMValueRef x, 1557 LLVMValueRef y, 1558 LLVMValueRef v00, 1559 LLVMValueRef v01, 1560 LLVMValueRef v10, 1561 LLVMValueRef v11, 1562 unsigned flags) 1563 { 1564 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags); 1565 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags); 1566 return lp_build_lerp(bld, y, v0, v1, flags); 1567 } 1568 1569 1570 LLVMValueRef 1571 lp_build_lerp_3d(struct lp_build_context *bld, 1572 LLVMValueRef x, 1573 LLVMValueRef y, 1574 LLVMValueRef z, 1575 LLVMValueRef v000, 1576 LLVMValueRef v001, 1577 LLVMValueRef v010, 1578 LLVMValueRef v011, 1579 LLVMValueRef v100, 1580 LLVMValueRef v101, 1581 LLVMValueRef v110, 1582 LLVMValueRef v111, 1583 unsigned flags) 1584 { 1585 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags); 1586 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags); 1587 return lp_build_lerp(bld, z, v0, v1, flags); 1588 } 1589 1590 1591 /** 1592 * Generate min(a, b) 1593 * Do checks for special cases but not for nans. 1594 */ 1595 LLVMValueRef 1596 lp_build_min(struct lp_build_context *bld, 1597 LLVMValueRef a, 1598 LLVMValueRef b) 1599 { 1600 assert(lp_check_value(bld->type, a)); 1601 assert(lp_check_value(bld->type, b)); 1602 1603 if(a == bld->undef || b == bld->undef) 1604 return bld->undef; 1605 1606 if(a == b) 1607 return a; 1608 1609 if (bld->type.norm) { 1610 if (!bld->type.sign) { 1611 if (a == bld->zero || b == bld->zero) { 1612 return bld->zero; 1613 } 1614 } 1615 if(a == bld->one) 1616 return b; 1617 if(b == bld->one) 1618 return a; 1619 } 1620 1621 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1622 } 1623 1624 1625 /** 1626 * Generate min(a, b) 1627 * NaN's are handled according to the behavior specified by the 1628 * nan_behavior argument. 1629 */ 1630 LLVMValueRef 1631 lp_build_min_ext(struct lp_build_context *bld, 1632 LLVMValueRef a, 1633 LLVMValueRef b, 1634 enum gallivm_nan_behavior nan_behavior) 1635 { 1636 assert(lp_check_value(bld->type, a)); 1637 assert(lp_check_value(bld->type, b)); 1638 1639 if(a == bld->undef || b == bld->undef) 1640 return bld->undef; 1641 1642 if(a == b) 1643 return a; 1644 1645 if (bld->type.norm) { 1646 if (!bld->type.sign) { 1647 if (a == bld->zero || b == bld->zero) { 1648 return bld->zero; 1649 } 1650 } 1651 if(a == bld->one) 1652 return b; 1653 if(b == bld->one) 1654 return a; 1655 } 1656 1657 return lp_build_min_simple(bld, a, b, nan_behavior); 1658 } 1659 1660 /** 1661 * Generate max(a, b) 1662 * Do checks for special cases, but NaN behavior is undefined. 1663 */ 1664 LLVMValueRef 1665 lp_build_max(struct lp_build_context *bld, 1666 LLVMValueRef a, 1667 LLVMValueRef b) 1668 { 1669 assert(lp_check_value(bld->type, a)); 1670 assert(lp_check_value(bld->type, b)); 1671 1672 if(a == bld->undef || b == bld->undef) 1673 return bld->undef; 1674 1675 if(a == b) 1676 return a; 1677 1678 if(bld->type.norm) { 1679 if(a == bld->one || b == bld->one) 1680 return bld->one; 1681 if (!bld->type.sign) { 1682 if (a == bld->zero) { 1683 return b; 1684 } 1685 if (b == bld->zero) { 1686 return a; 1687 } 1688 } 1689 } 1690 1691 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1692 } 1693 1694 1695 /** 1696 * Generate max(a, b) 1697 * Checks for special cases. 1698 * NaN's are handled according to the behavior specified by the 1699 * nan_behavior argument. 1700 */ 1701 LLVMValueRef 1702 lp_build_max_ext(struct lp_build_context *bld, 1703 LLVMValueRef a, 1704 LLVMValueRef b, 1705 enum gallivm_nan_behavior nan_behavior) 1706 { 1707 assert(lp_check_value(bld->type, a)); 1708 assert(lp_check_value(bld->type, b)); 1709 1710 if(a == bld->undef || b == bld->undef) 1711 return bld->undef; 1712 1713 if(a == b) 1714 return a; 1715 1716 if(bld->type.norm) { 1717 if(a == bld->one || b == bld->one) 1718 return bld->one; 1719 if (!bld->type.sign) { 1720 if (a == bld->zero) { 1721 return b; 1722 } 1723 if (b == bld->zero) { 1724 return a; 1725 } 1726 } 1727 } 1728 1729 return lp_build_max_simple(bld, a, b, nan_behavior); 1730 } 1731 1732 /** 1733 * Generate clamp(a, min, max) 1734 * NaN behavior (for any of a, min, max) is undefined. 1735 * Do checks for special cases. 1736 */ 1737 LLVMValueRef 1738 lp_build_clamp(struct lp_build_context *bld, 1739 LLVMValueRef a, 1740 LLVMValueRef min, 1741 LLVMValueRef max) 1742 { 1743 assert(lp_check_value(bld->type, a)); 1744 assert(lp_check_value(bld->type, min)); 1745 assert(lp_check_value(bld->type, max)); 1746 1747 a = lp_build_min(bld, a, max); 1748 a = lp_build_max(bld, a, min); 1749 return a; 1750 } 1751 1752 1753 /** 1754 * Generate clamp(a, 0, 1) 1755 * A NaN will get converted to zero. 1756 */ 1757 LLVMValueRef 1758 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld, 1759 LLVMValueRef a) 1760 { 1761 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 1762 a = lp_build_min(bld, a, bld->one); 1763 return a; 1764 } 1765 1766 1767 /** 1768 * Generate abs(a) 1769 */ 1770 LLVMValueRef 1771 lp_build_abs(struct lp_build_context *bld, 1772 LLVMValueRef a) 1773 { 1774 LLVMBuilderRef builder = bld->gallivm->builder; 1775 const struct lp_type type = bld->type; 1776 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1777 1778 assert(lp_check_value(type, a)); 1779 1780 if(!type.sign) 1781 return a; 1782 1783 if(type.floating) { 1784 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) { 1785 /* Workaround llvm.org/PR27332 */ 1786 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1787 unsigned long long absMask = ~(1ULL << (type.width - 1)); 1788 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 1789 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1790 a = LLVMBuildAnd(builder, a, mask, ""); 1791 a = LLVMBuildBitCast(builder, a, vec_type, ""); 1792 return a; 1793 } else { 1794 char intrinsic[32]; 1795 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); 1796 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1797 } 1798 } 1799 1800 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 1801 switch(type.width) { 1802 case 8: 1803 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1804 case 16: 1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1806 case 32: 1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1808 } 1809 } 1810 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) { 1811 switch(type.width) { 1812 case 8: 1813 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); 1814 case 16: 1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); 1816 case 32: 1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); 1818 } 1819 } 1820 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && 1821 (gallivm_debug & GALLIVM_DEBUG_PERF) && 1822 (type.width == 8 || type.width == 16 || type.width == 32)) { 1823 debug_printf("%s: inefficient code, should split vectors manually\n", 1824 __FUNCTION__); 1825 } 1826 1827 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 1828 } 1829 1830 1831 LLVMValueRef 1832 lp_build_negate(struct lp_build_context *bld, 1833 LLVMValueRef a) 1834 { 1835 LLVMBuilderRef builder = bld->gallivm->builder; 1836 1837 assert(lp_check_value(bld->type, a)); 1838 1839 if (bld->type.floating) 1840 a = LLVMBuildFNeg(builder, a, ""); 1841 else 1842 a = LLVMBuildNeg(builder, a, ""); 1843 1844 return a; 1845 } 1846 1847 1848 /** Return -1, 0 or +1 depending on the sign of a */ 1849 LLVMValueRef 1850 lp_build_sgn(struct lp_build_context *bld, 1851 LLVMValueRef a) 1852 { 1853 LLVMBuilderRef builder = bld->gallivm->builder; 1854 const struct lp_type type = bld->type; 1855 LLVMValueRef cond; 1856 LLVMValueRef res; 1857 1858 assert(lp_check_value(type, a)); 1859 1860 /* Handle non-zero case */ 1861 if(!type.sign) { 1862 /* if not zero then sign must be positive */ 1863 res = bld->one; 1864 } 1865 else if(type.floating) { 1866 LLVMTypeRef vec_type; 1867 LLVMTypeRef int_type; 1868 LLVMValueRef mask; 1869 LLVMValueRef sign; 1870 LLVMValueRef one; 1871 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1872 1873 int_type = lp_build_int_vec_type(bld->gallivm, type); 1874 vec_type = lp_build_vec_type(bld->gallivm, type); 1875 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1876 1877 /* Take the sign bit and add it to 1 constant */ 1878 sign = LLVMBuildBitCast(builder, a, int_type, ""); 1879 sign = LLVMBuildAnd(builder, sign, mask, ""); 1880 one = LLVMConstBitCast(bld->one, int_type); 1881 res = LLVMBuildOr(builder, sign, one, ""); 1882 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1883 } 1884 else 1885 { 1886 /* signed int/norm/fixed point */ 1887 /* could use psign with sse3 and appropriate vectors here */ 1888 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1889 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1890 res = lp_build_select(bld, cond, bld->one, minus_one); 1891 } 1892 1893 /* Handle zero */ 1894 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1895 res = lp_build_select(bld, cond, bld->zero, res); 1896 1897 return res; 1898 } 1899 1900 1901 /** 1902 * Set the sign of float vector 'a' according to 'sign'. 1903 * If sign==0, return abs(a). 1904 * If sign==1, return -abs(a); 1905 * Other values for sign produce undefined results. 1906 */ 1907 LLVMValueRef 1908 lp_build_set_sign(struct lp_build_context *bld, 1909 LLVMValueRef a, LLVMValueRef sign) 1910 { 1911 LLVMBuilderRef builder = bld->gallivm->builder; 1912 const struct lp_type type = bld->type; 1913 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1914 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1915 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1916 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1917 ~((unsigned long long) 1 << (type.width - 1))); 1918 LLVMValueRef val, res; 1919 1920 assert(type.floating); 1921 assert(lp_check_value(type, a)); 1922 1923 /* val = reinterpret_cast<int>(a) */ 1924 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1925 /* val = val & mask */ 1926 val = LLVMBuildAnd(builder, val, mask, ""); 1927 /* sign = sign << shift */ 1928 sign = LLVMBuildShl(builder, sign, shift, ""); 1929 /* res = val | sign */ 1930 res = LLVMBuildOr(builder, val, sign, ""); 1931 /* res = reinterpret_cast<float>(res) */ 1932 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1933 1934 return res; 1935 } 1936 1937 1938 /** 1939 * Convert vector of (or scalar) int to vector of (or scalar) float. 1940 */ 1941 LLVMValueRef 1942 lp_build_int_to_float(struct lp_build_context *bld, 1943 LLVMValueRef a) 1944 { 1945 LLVMBuilderRef builder = bld->gallivm->builder; 1946 const struct lp_type type = bld->type; 1947 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1948 1949 assert(type.floating); 1950 1951 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1952 } 1953 1954 static boolean 1955 arch_rounding_available(const struct lp_type type) 1956 { 1957 if ((util_cpu_caps.has_sse4_1 && 1958 (type.length == 1 || type.width*type.length == 128)) || 1959 (util_cpu_caps.has_avx && type.width*type.length == 256)) 1960 return TRUE; 1961 else if ((util_cpu_caps.has_altivec && 1962 (type.width == 32 && type.length == 4))) 1963 return TRUE; 1964 1965 return FALSE; 1966 } 1967 1968 enum lp_build_round_mode 1969 { 1970 LP_BUILD_ROUND_NEAREST = 0, 1971 LP_BUILD_ROUND_FLOOR = 1, 1972 LP_BUILD_ROUND_CEIL = 2, 1973 LP_BUILD_ROUND_TRUNCATE = 3 1974 }; 1975 1976 static inline LLVMValueRef 1977 lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1978 LLVMValueRef a) 1979 { 1980 LLVMBuilderRef builder = bld->gallivm->builder; 1981 const struct lp_type type = bld->type; 1982 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1983 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1984 const char *intrinsic; 1985 LLVMValueRef res; 1986 1987 assert(type.floating); 1988 /* using the double precision conversions is a bit more complicated */ 1989 assert(type.width == 32); 1990 1991 assert(lp_check_value(type, a)); 1992 assert(util_cpu_caps.has_sse2); 1993 1994 /* This is relying on MXCSR rounding mode, which should always be nearest. */ 1995 if (type.length == 1) { 1996 LLVMTypeRef vec_type; 1997 LLVMValueRef undef; 1998 LLVMValueRef arg; 1999 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 2000 2001 vec_type = LLVMVectorType(bld->elem_type, 4); 2002 2003 intrinsic = "llvm.x86.sse.cvtss2si"; 2004 2005 undef = LLVMGetUndef(vec_type); 2006 2007 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 2008 2009 res = lp_build_intrinsic_unary(builder, intrinsic, 2010 ret_type, arg); 2011 } 2012 else { 2013 if (type.width* type.length == 128) { 2014 intrinsic = "llvm.x86.sse2.cvtps2dq"; 2015 } 2016 else { 2017 assert(type.width*type.length == 256); 2018 assert(util_cpu_caps.has_avx); 2019 2020 intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 2021 } 2022 res = lp_build_intrinsic_unary(builder, intrinsic, 2023 ret_type, a); 2024 } 2025 2026 return res; 2027 } 2028 2029 2030 /* 2031 */ 2032 static inline LLVMValueRef 2033 lp_build_round_altivec(struct lp_build_context *bld, 2034 LLVMValueRef a, 2035 enum lp_build_round_mode mode) 2036 { 2037 LLVMBuilderRef builder = bld->gallivm->builder; 2038 const struct lp_type type = bld->type; 2039 const char *intrinsic = NULL; 2040 2041 assert(type.floating); 2042 2043 assert(lp_check_value(type, a)); 2044 assert(util_cpu_caps.has_altivec); 2045 2046 (void)type; 2047 2048 switch (mode) { 2049 case LP_BUILD_ROUND_NEAREST: 2050 intrinsic = "llvm.ppc.altivec.vrfin"; 2051 break; 2052 case LP_BUILD_ROUND_FLOOR: 2053 intrinsic = "llvm.ppc.altivec.vrfim"; 2054 break; 2055 case LP_BUILD_ROUND_CEIL: 2056 intrinsic = "llvm.ppc.altivec.vrfip"; 2057 break; 2058 case LP_BUILD_ROUND_TRUNCATE: 2059 intrinsic = "llvm.ppc.altivec.vrfiz"; 2060 break; 2061 } 2062 2063 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2064 } 2065 2066 static inline LLVMValueRef 2067 lp_build_round_arch(struct lp_build_context *bld, 2068 LLVMValueRef a, 2069 enum lp_build_round_mode mode) 2070 { 2071 if (util_cpu_caps.has_sse4_1) { 2072 LLVMBuilderRef builder = bld->gallivm->builder; 2073 const struct lp_type type = bld->type; 2074 const char *intrinsic_root; 2075 char intrinsic[32]; 2076 2077 assert(type.floating); 2078 assert(lp_check_value(type, a)); 2079 (void)type; 2080 2081 switch (mode) { 2082 case LP_BUILD_ROUND_NEAREST: 2083 intrinsic_root = "llvm.nearbyint"; 2084 break; 2085 case LP_BUILD_ROUND_FLOOR: 2086 intrinsic_root = "llvm.floor"; 2087 break; 2088 case LP_BUILD_ROUND_CEIL: 2089 intrinsic_root = "llvm.ceil"; 2090 break; 2091 case LP_BUILD_ROUND_TRUNCATE: 2092 intrinsic_root = "llvm.trunc"; 2093 break; 2094 } 2095 2096 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); 2097 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2098 } 2099 else /* (util_cpu_caps.has_altivec) */ 2100 return lp_build_round_altivec(bld, a, mode); 2101 } 2102 2103 /** 2104 * Return the integer part of a float (vector) value (== round toward zero). 2105 * The returned value is a float (vector). 2106 * Ex: trunc(-1.5) = -1.0 2107 */ 2108 LLVMValueRef 2109 lp_build_trunc(struct lp_build_context *bld, 2110 LLVMValueRef a) 2111 { 2112 LLVMBuilderRef builder = bld->gallivm->builder; 2113 const struct lp_type type = bld->type; 2114 2115 assert(type.floating); 2116 assert(lp_check_value(type, a)); 2117 2118 if (arch_rounding_available(type)) { 2119 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); 2120 } 2121 else { 2122 const struct lp_type type = bld->type; 2123 struct lp_type inttype; 2124 struct lp_build_context intbld; 2125 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2126 LLVMValueRef trunc, res, anosign, mask; 2127 LLVMTypeRef int_vec_type = bld->int_vec_type; 2128 LLVMTypeRef vec_type = bld->vec_type; 2129 2130 assert(type.width == 32); /* might want to handle doubles at some point */ 2131 2132 inttype = type; 2133 inttype.floating = 0; 2134 lp_build_context_init(&intbld, bld->gallivm, inttype); 2135 2136 /* round by truncation */ 2137 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2138 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2139 2140 /* mask out sign bit */ 2141 anosign = lp_build_abs(bld, a); 2142 /* 2143 * mask out all values if anosign > 2^24 2144 * This should work both for large ints (all rounding is no-op for them 2145 * because such floats are always exact) as well as special cases like 2146 * NaNs, Infs (taking advantage of the fact they use max exponent). 2147 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2148 */ 2149 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2150 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2151 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2152 return lp_build_select(bld, mask, a, res); 2153 } 2154 } 2155 2156 2157 /** 2158 * Return float (vector) rounded to nearest integer (vector). The returned 2159 * value is a float (vector). 2160 * Ex: round(0.9) = 1.0 2161 * Ex: round(-1.5) = -2.0 2162 */ 2163 LLVMValueRef 2164 lp_build_round(struct lp_build_context *bld, 2165 LLVMValueRef a) 2166 { 2167 LLVMBuilderRef builder = bld->gallivm->builder; 2168 const struct lp_type type = bld->type; 2169 2170 assert(type.floating); 2171 assert(lp_check_value(type, a)); 2172 2173 if (arch_rounding_available(type)) { 2174 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2175 } 2176 else { 2177 const struct lp_type type = bld->type; 2178 struct lp_type inttype; 2179 struct lp_build_context intbld; 2180 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2181 LLVMValueRef res, anosign, mask; 2182 LLVMTypeRef int_vec_type = bld->int_vec_type; 2183 LLVMTypeRef vec_type = bld->vec_type; 2184 2185 assert(type.width == 32); /* might want to handle doubles at some point */ 2186 2187 inttype = type; 2188 inttype.floating = 0; 2189 lp_build_context_init(&intbld, bld->gallivm, inttype); 2190 2191 res = lp_build_iround(bld, a); 2192 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 2193 2194 /* mask out sign bit */ 2195 anosign = lp_build_abs(bld, a); 2196 /* 2197 * mask out all values if anosign > 2^24 2198 * This should work both for large ints (all rounding is no-op for them 2199 * because such floats are always exact) as well as special cases like 2200 * NaNs, Infs (taking advantage of the fact they use max exponent). 2201 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2202 */ 2203 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2204 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2205 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2206 return lp_build_select(bld, mask, a, res); 2207 } 2208 } 2209 2210 2211 /** 2212 * Return floor of float (vector), result is a float (vector) 2213 * Ex: floor(1.1) = 1.0 2214 * Ex: floor(-1.1) = -2.0 2215 */ 2216 LLVMValueRef 2217 lp_build_floor(struct lp_build_context *bld, 2218 LLVMValueRef a) 2219 { 2220 LLVMBuilderRef builder = bld->gallivm->builder; 2221 const struct lp_type type = bld->type; 2222 2223 assert(type.floating); 2224 assert(lp_check_value(type, a)); 2225 2226 if (arch_rounding_available(type)) { 2227 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2228 } 2229 else { 2230 const struct lp_type type = bld->type; 2231 struct lp_type inttype; 2232 struct lp_build_context intbld; 2233 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2234 LLVMValueRef trunc, res, anosign, mask; 2235 LLVMTypeRef int_vec_type = bld->int_vec_type; 2236 LLVMTypeRef vec_type = bld->vec_type; 2237 2238 if (type.width != 32) { 2239 char intrinsic[32]; 2240 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type); 2241 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2242 } 2243 2244 assert(type.width == 32); /* might want to handle doubles at some point */ 2245 2246 inttype = type; 2247 inttype.floating = 0; 2248 lp_build_context_init(&intbld, bld->gallivm, inttype); 2249 2250 /* round by truncation */ 2251 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2252 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2253 2254 if (type.sign) { 2255 LLVMValueRef tmp; 2256 2257 /* 2258 * fix values if rounding is wrong (for non-special cases) 2259 * - this is the case if trunc > a 2260 */ 2261 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a); 2262 /* tmp = trunc > a ? 1.0 : 0.0 */ 2263 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2264 tmp = lp_build_and(&intbld, mask, tmp); 2265 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2266 res = lp_build_sub(bld, res, tmp); 2267 } 2268 2269 /* mask out sign bit */ 2270 anosign = lp_build_abs(bld, a); 2271 /* 2272 * mask out all values if anosign > 2^24 2273 * This should work both for large ints (all rounding is no-op for them 2274 * because such floats are always exact) as well as special cases like 2275 * NaNs, Infs (taking advantage of the fact they use max exponent). 2276 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2277 */ 2278 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2279 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2280 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2281 return lp_build_select(bld, mask, a, res); 2282 } 2283 } 2284 2285 2286 /** 2287 * Return ceiling of float (vector), returning float (vector). 2288 * Ex: ceil( 1.1) = 2.0 2289 * Ex: ceil(-1.1) = -1.0 2290 */ 2291 LLVMValueRef 2292 lp_build_ceil(struct lp_build_context *bld, 2293 LLVMValueRef a) 2294 { 2295 LLVMBuilderRef builder = bld->gallivm->builder; 2296 const struct lp_type type = bld->type; 2297 2298 assert(type.floating); 2299 assert(lp_check_value(type, a)); 2300 2301 if (arch_rounding_available(type)) { 2302 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2303 } 2304 else { 2305 const struct lp_type type = bld->type; 2306 struct lp_type inttype; 2307 struct lp_build_context intbld; 2308 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2309 LLVMValueRef trunc, res, anosign, mask, tmp; 2310 LLVMTypeRef int_vec_type = bld->int_vec_type; 2311 LLVMTypeRef vec_type = bld->vec_type; 2312 2313 if (type.width != 32) { 2314 char intrinsic[32]; 2315 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type); 2316 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2317 } 2318 2319 assert(type.width == 32); /* might want to handle doubles at some point */ 2320 2321 inttype = type; 2322 inttype.floating = 0; 2323 lp_build_context_init(&intbld, bld->gallivm, inttype); 2324 2325 /* round by truncation */ 2326 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2327 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc"); 2328 2329 /* 2330 * fix values if rounding is wrong (for non-special cases) 2331 * - this is the case if trunc < a 2332 */ 2333 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2334 /* tmp = trunc < a ? 1.0 : 0.0 */ 2335 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2336 tmp = lp_build_and(&intbld, mask, tmp); 2337 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2338 res = lp_build_add(bld, trunc, tmp); 2339 2340 /* mask out sign bit */ 2341 anosign = lp_build_abs(bld, a); 2342 /* 2343 * mask out all values if anosign > 2^24 2344 * This should work both for large ints (all rounding is no-op for them 2345 * because such floats are always exact) as well as special cases like 2346 * NaNs, Infs (taking advantage of the fact they use max exponent). 2347 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2348 */ 2349 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2350 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2351 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2352 return lp_build_select(bld, mask, a, res); 2353 } 2354 } 2355 2356 2357 /** 2358 * Return fractional part of 'a' computed as a - floor(a) 2359 * Typically used in texture coord arithmetic. 2360 */ 2361 LLVMValueRef 2362 lp_build_fract(struct lp_build_context *bld, 2363 LLVMValueRef a) 2364 { 2365 assert(bld->type.floating); 2366 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 2367 } 2368 2369 2370 /** 2371 * Prevent returning 1.0 for very small negative values of 'a' by clamping 2372 * against 0.99999(9). (Will also return that value for NaNs.) 2373 */ 2374 static inline LLVMValueRef 2375 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 2376 { 2377 LLVMValueRef max; 2378 2379 /* this is the largest number smaller than 1.0 representable as float */ 2380 max = lp_build_const_vec(bld->gallivm, bld->type, 2381 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 2382 return lp_build_min_ext(bld, fract, max, 2383 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 2384 } 2385 2386 2387 /** 2388 * Same as lp_build_fract, but guarantees that the result is always smaller 2389 * than one. Will also return the smaller-than-one value for infs, NaNs. 2390 */ 2391 LLVMValueRef 2392 lp_build_fract_safe(struct lp_build_context *bld, 2393 LLVMValueRef a) 2394 { 2395 return clamp_fract(bld, lp_build_fract(bld, a)); 2396 } 2397 2398 2399 /** 2400 * Return the integer part of a float (vector) value (== round toward zero). 2401 * The returned value is an integer (vector). 2402 * Ex: itrunc(-1.5) = -1 2403 */ 2404 LLVMValueRef 2405 lp_build_itrunc(struct lp_build_context *bld, 2406 LLVMValueRef a) 2407 { 2408 LLVMBuilderRef builder = bld->gallivm->builder; 2409 const struct lp_type type = bld->type; 2410 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2411 2412 assert(type.floating); 2413 assert(lp_check_value(type, a)); 2414 2415 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2416 } 2417 2418 2419 /** 2420 * Return float (vector) rounded to nearest integer (vector). The returned 2421 * value is an integer (vector). 2422 * Ex: iround(0.9) = 1 2423 * Ex: iround(-1.5) = -2 2424 */ 2425 LLVMValueRef 2426 lp_build_iround(struct lp_build_context *bld, 2427 LLVMValueRef a) 2428 { 2429 LLVMBuilderRef builder = bld->gallivm->builder; 2430 const struct lp_type type = bld->type; 2431 LLVMTypeRef int_vec_type = bld->int_vec_type; 2432 LLVMValueRef res; 2433 2434 assert(type.floating); 2435 2436 assert(lp_check_value(type, a)); 2437 2438 if ((util_cpu_caps.has_sse2 && 2439 ((type.width == 32) && (type.length == 1 || type.length == 4))) || 2440 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 2441 return lp_build_iround_nearest_sse2(bld, a); 2442 } 2443 if (arch_rounding_available(type)) { 2444 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2445 } 2446 else { 2447 LLVMValueRef half; 2448 2449 half = lp_build_const_vec(bld->gallivm, type, 0.5); 2450 2451 if (type.sign) { 2452 LLVMTypeRef vec_type = bld->vec_type; 2453 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 2454 (unsigned long long)1 << (type.width - 1)); 2455 LLVMValueRef sign; 2456 2457 /* get sign bit */ 2458 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 2459 sign = LLVMBuildAnd(builder, sign, mask, ""); 2460 2461 /* sign * 0.5 */ 2462 half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 2463 half = LLVMBuildOr(builder, sign, half, ""); 2464 half = LLVMBuildBitCast(builder, half, vec_type, ""); 2465 } 2466 2467 res = LLVMBuildFAdd(builder, a, half, ""); 2468 } 2469 2470 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 2471 2472 return res; 2473 } 2474 2475 2476 /** 2477 * Return floor of float (vector), result is an int (vector) 2478 * Ex: ifloor(1.1) = 1.0 2479 * Ex: ifloor(-1.1) = -2.0 2480 */ 2481 LLVMValueRef 2482 lp_build_ifloor(struct lp_build_context *bld, 2483 LLVMValueRef a) 2484 { 2485 LLVMBuilderRef builder = bld->gallivm->builder; 2486 const struct lp_type type = bld->type; 2487 LLVMTypeRef int_vec_type = bld->int_vec_type; 2488 LLVMValueRef res; 2489 2490 assert(type.floating); 2491 assert(lp_check_value(type, a)); 2492 2493 res = a; 2494 if (type.sign) { 2495 if (arch_rounding_available(type)) { 2496 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2497 } 2498 else { 2499 struct lp_type inttype; 2500 struct lp_build_context intbld; 2501 LLVMValueRef trunc, itrunc, mask; 2502 2503 assert(type.floating); 2504 assert(lp_check_value(type, a)); 2505 2506 inttype = type; 2507 inttype.floating = 0; 2508 lp_build_context_init(&intbld, bld->gallivm, inttype); 2509 2510 /* round by truncation */ 2511 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2512 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc"); 2513 2514 /* 2515 * fix values if rounding is wrong (for non-special cases) 2516 * - this is the case if trunc > a 2517 * The results of doing this with NaNs, very large values etc. 2518 * are undefined but this seems to be the case anyway. 2519 */ 2520 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a); 2521 /* cheapie minus one with mask since the mask is minus one / zero */ 2522 return lp_build_add(&intbld, itrunc, mask); 2523 } 2524 } 2525 2526 /* round to nearest (toward zero) */ 2527 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 2528 2529 return res; 2530 } 2531 2532 2533 /** 2534 * Return ceiling of float (vector), returning int (vector). 2535 * Ex: iceil( 1.1) = 2 2536 * Ex: iceil(-1.1) = -1 2537 */ 2538 LLVMValueRef 2539 lp_build_iceil(struct lp_build_context *bld, 2540 LLVMValueRef a) 2541 { 2542 LLVMBuilderRef builder = bld->gallivm->builder; 2543 const struct lp_type type = bld->type; 2544 LLVMTypeRef int_vec_type = bld->int_vec_type; 2545 LLVMValueRef res; 2546 2547 assert(type.floating); 2548 assert(lp_check_value(type, a)); 2549 2550 if (arch_rounding_available(type)) { 2551 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2552 } 2553 else { 2554 struct lp_type inttype; 2555 struct lp_build_context intbld; 2556 LLVMValueRef trunc, itrunc, mask; 2557 2558 assert(type.floating); 2559 assert(lp_check_value(type, a)); 2560 2561 inttype = type; 2562 inttype.floating = 0; 2563 lp_build_context_init(&intbld, bld->gallivm, inttype); 2564 2565 /* round by truncation */ 2566 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2567 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc"); 2568 2569 /* 2570 * fix values if rounding is wrong (for non-special cases) 2571 * - this is the case if trunc < a 2572 * The results of doing this with NaNs, very large values etc. 2573 * are undefined but this seems to be the case anyway. 2574 */ 2575 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2576 /* cheapie plus one with mask since the mask is minus one / zero */ 2577 return lp_build_sub(&intbld, itrunc, mask); 2578 } 2579 2580 /* round to nearest (toward zero) */ 2581 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 2582 2583 return res; 2584 } 2585 2586 2587 /** 2588 * Combined ifloor() & fract(). 2589 * 2590 * Preferred to calling the functions separately, as it will ensure that the 2591 * strategy (floor() vs ifloor()) that results in less redundant work is used. 2592 */ 2593 void 2594 lp_build_ifloor_fract(struct lp_build_context *bld, 2595 LLVMValueRef a, 2596 LLVMValueRef *out_ipart, 2597 LLVMValueRef *out_fpart) 2598 { 2599 LLVMBuilderRef builder = bld->gallivm->builder; 2600 const struct lp_type type = bld->type; 2601 LLVMValueRef ipart; 2602 2603 assert(type.floating); 2604 assert(lp_check_value(type, a)); 2605 2606 if (arch_rounding_available(type)) { 2607 /* 2608 * floor() is easier. 2609 */ 2610 2611 ipart = lp_build_floor(bld, a); 2612 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2613 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 2614 } 2615 else { 2616 /* 2617 * ifloor() is easier. 2618 */ 2619 2620 *out_ipart = lp_build_ifloor(bld, a); 2621 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 2622 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2623 } 2624 } 2625 2626 2627 /** 2628 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 2629 * always smaller than one. 2630 */ 2631 void 2632 lp_build_ifloor_fract_safe(struct lp_build_context *bld, 2633 LLVMValueRef a, 2634 LLVMValueRef *out_ipart, 2635 LLVMValueRef *out_fpart) 2636 { 2637 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 2638 *out_fpart = clamp_fract(bld, *out_fpart); 2639 } 2640 2641 2642 LLVMValueRef 2643 lp_build_sqrt(struct lp_build_context *bld, 2644 LLVMValueRef a) 2645 { 2646 LLVMBuilderRef builder = bld->gallivm->builder; 2647 const struct lp_type type = bld->type; 2648 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2649 char intrinsic[32]; 2650 2651 assert(lp_check_value(type, a)); 2652 2653 assert(type.floating); 2654 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type); 2655 2656 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2657 } 2658 2659 2660 /** 2661 * Do one Newton-Raphson step to improve reciprocate precision: 2662 * 2663 * x_{i+1} = x_i * (2 - a * x_i) 2664 * 2665 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 2666 * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 2667 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's 2668 * halo. It would be necessary to clamp the argument to prevent this. 2669 * 2670 * See also: 2671 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 2672 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 2673 */ 2674 static inline LLVMValueRef 2675 lp_build_rcp_refine(struct lp_build_context *bld, 2676 LLVMValueRef a, 2677 LLVMValueRef rcp_a) 2678 { 2679 LLVMBuilderRef builder = bld->gallivm->builder; 2680 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0); 2681 LLVMValueRef res; 2682 2683 res = LLVMBuildFMul(builder, a, rcp_a, ""); 2684 res = LLVMBuildFSub(builder, two, res, ""); 2685 res = LLVMBuildFMul(builder, rcp_a, res, ""); 2686 2687 return res; 2688 } 2689 2690 2691 LLVMValueRef 2692 lp_build_rcp(struct lp_build_context *bld, 2693 LLVMValueRef a) 2694 { 2695 LLVMBuilderRef builder = bld->gallivm->builder; 2696 const struct lp_type type = bld->type; 2697 2698 assert(lp_check_value(type, a)); 2699 2700 if(a == bld->zero) 2701 return bld->undef; 2702 if(a == bld->one) 2703 return bld->one; 2704 if(a == bld->undef) 2705 return bld->undef; 2706 2707 assert(type.floating); 2708 2709 if(LLVMIsConstant(a)) 2710 return LLVMConstFDiv(bld->one, a); 2711 2712 /* 2713 * We don't use RCPPS because: 2714 * - it only has 10bits of precision 2715 * - it doesn't even get the reciprocate of 1.0 exactly 2716 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 2717 * - for recent processors the benefit over DIVPS is marginal, a case 2718 * dependent 2719 * 2720 * We could still use it on certain processors if benchmarks show that the 2721 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 2722 * particular uses that require less workarounds. 2723 */ 2724 2725 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 2726 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ 2727 const unsigned num_iterations = 0; 2728 LLVMValueRef res; 2729 unsigned i; 2730 const char *intrinsic = NULL; 2731 2732 if (type.length == 4) { 2733 intrinsic = "llvm.x86.sse.rcp.ps"; 2734 } 2735 else { 2736 intrinsic = "llvm.x86.avx.rcp.ps.256"; 2737 } 2738 2739 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2740 2741 for (i = 0; i < num_iterations; ++i) { 2742 res = lp_build_rcp_refine(bld, a, res); 2743 } 2744 2745 return res; 2746 } 2747 2748 return LLVMBuildFDiv(builder, bld->one, a, ""); 2749 } 2750 2751 2752 /** 2753 * Do one Newton-Raphson step to improve rsqrt precision: 2754 * 2755 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 2756 * 2757 * See also Intel 64 and IA-32 Architectures Optimization Manual. 2758 */ 2759 static inline LLVMValueRef 2760 lp_build_rsqrt_refine(struct lp_build_context *bld, 2761 LLVMValueRef a, 2762 LLVMValueRef rsqrt_a) 2763 { 2764 LLVMBuilderRef builder = bld->gallivm->builder; 2765 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 2766 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 2767 LLVMValueRef res; 2768 2769 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 2770 res = LLVMBuildFMul(builder, a, res, ""); 2771 res = LLVMBuildFSub(builder, three, res, ""); 2772 res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 2773 res = LLVMBuildFMul(builder, half, res, ""); 2774 2775 return res; 2776 } 2777 2778 2779 /** 2780 * Generate 1/sqrt(a). 2781 * Result is undefined for values < 0, infinity for +0. 2782 */ 2783 LLVMValueRef 2784 lp_build_rsqrt(struct lp_build_context *bld, 2785 LLVMValueRef a) 2786 { 2787 const struct lp_type type = bld->type; 2788 2789 assert(lp_check_value(type, a)); 2790 2791 assert(type.floating); 2792 2793 /* 2794 * This should be faster but all denormals will end up as infinity. 2795 */ 2796 if (0 && lp_build_fast_rsqrt_available(type)) { 2797 const unsigned num_iterations = 1; 2798 LLVMValueRef res; 2799 unsigned i; 2800 2801 /* rsqrt(1.0) != 1.0 here */ 2802 res = lp_build_fast_rsqrt(bld, a); 2803 2804 if (num_iterations) { 2805 /* 2806 * Newton-Raphson will result in NaN instead of infinity for zero, 2807 * and NaN instead of zero for infinity. 2808 * Also, need to ensure rsqrt(1.0) == 1.0. 2809 * All numbers smaller than FLT_MIN will result in +infinity 2810 * (rsqrtps treats all denormals as zero). 2811 */ 2812 LLVMValueRef cmp; 2813 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN); 2814 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY); 2815 2816 for (i = 0; i < num_iterations; ++i) { 2817 res = lp_build_rsqrt_refine(bld, a, res); 2818 } 2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min); 2820 res = lp_build_select(bld, cmp, inf, res); 2821 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf); 2822 res = lp_build_select(bld, cmp, bld->zero, res); 2823 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one); 2824 res = lp_build_select(bld, cmp, bld->one, res); 2825 } 2826 2827 return res; 2828 } 2829 2830 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2831 } 2832 2833 /** 2834 * If there's a fast (inaccurate) rsqrt instruction available 2835 * (caller may want to avoid to call rsqrt_fast if it's not available, 2836 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if 2837 * unavailable it would result in sqrt/div/mul so obviously 2838 * much better to just call sqrt, skipping both div and mul). 2839 */ 2840 boolean 2841 lp_build_fast_rsqrt_available(struct lp_type type) 2842 { 2843 assert(type.floating); 2844 2845 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 2846 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 2847 return true; 2848 } 2849 return false; 2850 } 2851 2852 2853 /** 2854 * Generate 1/sqrt(a). 2855 * Result is undefined for values < 0, infinity for +0. 2856 * Precision is limited, only ~10 bits guaranteed 2857 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). 2858 */ 2859 LLVMValueRef 2860 lp_build_fast_rsqrt(struct lp_build_context *bld, 2861 LLVMValueRef a) 2862 { 2863 LLVMBuilderRef builder = bld->gallivm->builder; 2864 const struct lp_type type = bld->type; 2865 2866 assert(lp_check_value(type, a)); 2867 2868 if (lp_build_fast_rsqrt_available(type)) { 2869 const char *intrinsic = NULL; 2870 2871 if (type.length == 4) { 2872 intrinsic = "llvm.x86.sse.rsqrt.ps"; 2873 } 2874 else { 2875 intrinsic = "llvm.x86.avx.rsqrt.ps.256"; 2876 } 2877 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2878 } 2879 else { 2880 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__); 2881 } 2882 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2883 } 2884 2885 2886 /** 2887 * Generate sin(a) or cos(a) using polynomial approximation. 2888 * TODO: it might be worth recognizing sin and cos using same source 2889 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time 2890 * would be way cheaper than calculating (nearly) everything twice... 2891 * Not sure it's common enough to be worth bothering however, scs 2892 * opcode could also benefit from calculating both though. 2893 */ 2894 static LLVMValueRef 2895 lp_build_sin_or_cos(struct lp_build_context *bld, 2896 LLVMValueRef a, 2897 boolean cos) 2898 { 2899 struct gallivm_state *gallivm = bld->gallivm; 2900 LLVMBuilderRef b = gallivm->builder; 2901 struct lp_type int_type = lp_int_type(bld->type); 2902 2903 /* 2904 * take the absolute value, 2905 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2906 */ 2907 2908 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2909 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2910 2911 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2912 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2913 2914 /* 2915 * scale by 4/Pi 2916 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2917 */ 2918 2919 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2920 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2921 2922 /* 2923 * store the integer part of y in mm0 2924 * emm2 = _mm_cvttps_epi32(y); 2925 */ 2926 2927 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2928 2929 /* 2930 * j=(j+1) & (~1) (see the cephes sources) 2931 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2932 */ 2933 2934 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2935 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2936 /* 2937 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2938 */ 2939 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2940 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2941 2942 /* 2943 * y = _mm_cvtepi32_ps(emm2); 2944 */ 2945 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2946 2947 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2948 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2949 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2950 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2951 2952 /* 2953 * Argument used for poly selection and sign bit determination 2954 * is different for sin vs. cos. 2955 */ 2956 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") : 2957 emm2_and; 2958 2959 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4, 2960 LLVMBuildNot(b, emm2_2, ""), ""), 2961 const_29, "sign_bit") : 2962 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si, 2963 LLVMBuildShl(b, emm2_add, 2964 const_29, ""), ""), 2965 sign_mask, "sign_bit"); 2966 2967 /* 2968 * get the polynom selection mask 2969 * there is one polynom for 0 <= x <= Pi/4 2970 * and another one for Pi/4<x<=Pi/2 2971 * Both branches will be computed. 2972 * 2973 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2974 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2975 */ 2976 2977 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3"); 2978 LLVMValueRef poly_mask = lp_build_compare(gallivm, 2979 int_type, PIPE_FUNC_EQUAL, 2980 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2981 2982 /* 2983 * _PS_CONST(minus_cephes_DP1, -0.78515625); 2984 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2985 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2986 */ 2987 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2988 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2989 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2990 2991 /* 2992 * The magic pass: "Extended precision modular arithmetic" 2993 * x = ((x - y * DP1) - y * DP2) - y * DP3; 2994 */ 2995 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs); 2996 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1); 2997 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2); 2998 2999 /* 3000 * Evaluate the first polynom (0 <= x <= Pi/4) 3001 * 3002 * z = _mm_mul_ps(x,x); 3003 */ 3004 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 3005 3006 /* 3007 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 3008 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 3009 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 3010 */ 3011 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 3012 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 3013 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 3014 3015 /* 3016 * y = *(v4sf*)_ps_coscof_p0; 3017 * y = _mm_mul_ps(y, z); 3018 */ 3019 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1); 3020 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2); 3021 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 3022 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 3023 3024 3025 /* 3026 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 3027 * y = _mm_sub_ps(y, tmp); 3028 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 3029 */ 3030 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 3031 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 3032 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 3033 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 3034 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 3035 3036 /* 3037 * _PS_CONST(sincof_p0, -1.9515295891E-4); 3038 * _PS_CONST(sincof_p1, 8.3321608736E-3); 3039 * _PS_CONST(sincof_p2, -1.6666654611E-1); 3040 */ 3041 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 3042 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 3043 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 3044 3045 /* 3046 * Evaluate the second polynom (Pi/4 <= x <= 0) 3047 * 3048 * y2 = *(v4sf*)_ps_sincof_p0; 3049 * y2 = _mm_mul_ps(y2, z); 3050 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 3051 * y2 = _mm_mul_ps(y2, z); 3052 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 3053 * y2 = _mm_mul_ps(y2, z); 3054 * y2 = _mm_mul_ps(y2, x); 3055 * y2 = _mm_add_ps(y2, x); 3056 */ 3057 3058 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1); 3059 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2); 3060 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 3061 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3); 3062 3063 /* 3064 * select the correct result from the two polynoms 3065 * xmm3 = poly_mask; 3066 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 3067 * y = _mm_andnot_ps(xmm3, y); 3068 * y = _mm_or_ps(y,y2); 3069 */ 3070 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 3071 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 3072 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 3073 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv"); 3074 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 3075 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine"); 3076 3077 /* 3078 * update the sign 3079 * y = _mm_xor_ps(y, sign_bit); 3080 */ 3081 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign"); 3082 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 3083 3084 LLVMValueRef isfinite = lp_build_isfinite(bld, a); 3085 3086 /* clamp output to be within [-1, 1] */ 3087 y_result = lp_build_clamp(bld, y_result, 3088 lp_build_const_vec(bld->gallivm, bld->type, -1.f), 3089 lp_build_const_vec(bld->gallivm, bld->type, 1.f)); 3090 /* If a is -inf, inf or NaN then return NaN */ 3091 y_result = lp_build_select(bld, isfinite, y_result, 3092 lp_build_const_vec(bld->gallivm, bld->type, NAN)); 3093 return y_result; 3094 } 3095 3096 3097 /** 3098 * Generate sin(a) 3099 */ 3100 LLVMValueRef 3101 lp_build_sin(struct lp_build_context *bld, 3102 LLVMValueRef a) 3103 { 3104 return lp_build_sin_or_cos(bld, a, FALSE); 3105 } 3106 3107 3108 /** 3109 * Generate cos(a) 3110 */ 3111 LLVMValueRef 3112 lp_build_cos(struct lp_build_context *bld, 3113 LLVMValueRef a) 3114 { 3115 return lp_build_sin_or_cos(bld, a, TRUE); 3116 } 3117 3118 3119 /** 3120 * Generate pow(x, y) 3121 */ 3122 LLVMValueRef 3123 lp_build_pow(struct lp_build_context *bld, 3124 LLVMValueRef x, 3125 LLVMValueRef y) 3126 { 3127 /* TODO: optimize the constant case */ 3128 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3129 LLVMIsConstant(x) && LLVMIsConstant(y)) { 3130 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3131 __FUNCTION__); 3132 } 3133 3134 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 3135 } 3136 3137 3138 /** 3139 * Generate exp(x) 3140 */ 3141 LLVMValueRef 3142 lp_build_exp(struct lp_build_context *bld, 3143 LLVMValueRef x) 3144 { 3145 /* log2(e) = 1/log(2) */ 3146 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 3147 1.4426950408889634); 3148 3149 assert(lp_check_value(bld->type, x)); 3150 3151 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 3152 } 3153 3154 3155 /** 3156 * Generate log(x) 3157 * Behavior is undefined with infs, 0s and nans 3158 */ 3159 LLVMValueRef 3160 lp_build_log(struct lp_build_context *bld, 3161 LLVMValueRef x) 3162 { 3163 /* log(2) */ 3164 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3165 0.69314718055994529); 3166 3167 assert(lp_check_value(bld->type, x)); 3168 3169 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 3170 } 3171 3172 /** 3173 * Generate log(x) that handles edge cases (infs, 0s and nans) 3174 */ 3175 LLVMValueRef 3176 lp_build_log_safe(struct lp_build_context *bld, 3177 LLVMValueRef x) 3178 { 3179 /* log(2) */ 3180 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3181 0.69314718055994529); 3182 3183 assert(lp_check_value(bld->type, x)); 3184 3185 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x)); 3186 } 3187 3188 3189 /** 3190 * Generate polynomial. 3191 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 3192 */ 3193 LLVMValueRef 3194 lp_build_polynomial(struct lp_build_context *bld, 3195 LLVMValueRef x, 3196 const double *coeffs, 3197 unsigned num_coeffs) 3198 { 3199 const struct lp_type type = bld->type; 3200 LLVMValueRef even = NULL, odd = NULL; 3201 LLVMValueRef x2; 3202 unsigned i; 3203 3204 assert(lp_check_value(bld->type, x)); 3205 3206 /* TODO: optimize the constant case */ 3207 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3208 LLVMIsConstant(x)) { 3209 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3210 __FUNCTION__); 3211 } 3212 3213 /* 3214 * Calculate odd and even terms seperately to decrease data dependency 3215 * Ex: 3216 * c[0] + x^2 * c[2] + x^4 * c[4] ... 3217 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 3218 */ 3219 x2 = lp_build_mul(bld, x, x); 3220 3221 for (i = num_coeffs; i--; ) { 3222 LLVMValueRef coeff; 3223 3224 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 3225 3226 if (i % 2 == 0) { 3227 if (even) 3228 even = lp_build_mad(bld, x2, even, coeff); 3229 else 3230 even = coeff; 3231 } else { 3232 if (odd) 3233 odd = lp_build_mad(bld, x2, odd, coeff); 3234 else 3235 odd = coeff; 3236 } 3237 } 3238 3239 if (odd) 3240 return lp_build_mad(bld, odd, x, even); 3241 else if (even) 3242 return even; 3243 else 3244 return bld->undef; 3245 } 3246 3247 3248 /** 3249 * Minimax polynomial fit of 2**x, in range [0, 1[ 3250 */ 3251 const double lp_build_exp2_polynomial[] = { 3252 #if EXP_POLY_DEGREE == 5 3253 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */ 3254 0.693153073200168932794, 3255 0.240153617044375388211, 3256 0.0558263180532956664775, 3257 0.00898934009049466391101, 3258 0.00187757667519147912699 3259 #elif EXP_POLY_DEGREE == 4 3260 1.00000259337069434683, 3261 0.693003834469974940458, 3262 0.24144275689150793076, 3263 0.0520114606103070150235, 3264 0.0135341679161270268764 3265 #elif EXP_POLY_DEGREE == 3 3266 0.999925218562710312959, 3267 0.695833540494823811697, 3268 0.226067155427249155588, 3269 0.0780245226406372992967 3270 #elif EXP_POLY_DEGREE == 2 3271 1.00172476321474503578, 3272 0.657636275736077639316, 3273 0.33718943461968720704 3274 #else 3275 #error 3276 #endif 3277 }; 3278 3279 3280 LLVMValueRef 3281 lp_build_exp2(struct lp_build_context *bld, 3282 LLVMValueRef x) 3283 { 3284 LLVMBuilderRef builder = bld->gallivm->builder; 3285 const struct lp_type type = bld->type; 3286 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3287 LLVMValueRef ipart = NULL; 3288 LLVMValueRef fpart = NULL; 3289 LLVMValueRef expipart = NULL; 3290 LLVMValueRef expfpart = NULL; 3291 LLVMValueRef res = NULL; 3292 3293 assert(lp_check_value(bld->type, x)); 3294 3295 /* TODO: optimize the constant case */ 3296 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3297 LLVMIsConstant(x)) { 3298 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3299 __FUNCTION__); 3300 } 3301 3302 assert(type.floating && type.width == 32); 3303 3304 /* We want to preserve NaN and make sure than for exp2 if x > 128, 3305 * the result is INF and if it's smaller than -126.9 the result is 0 */ 3306 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x, 3307 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3308 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), 3309 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3310 3311 /* ipart = floor(x) */ 3312 /* fpart = x - ipart */ 3313 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 3314 3315 /* expipart = (float) (1 << ipart) */ 3316 expipart = LLVMBuildAdd(builder, ipart, 3317 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3318 expipart = LLVMBuildShl(builder, expipart, 3319 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3320 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 3321 3322 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 3323 ARRAY_SIZE(lp_build_exp2_polynomial)); 3324 3325 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 3326 3327 return res; 3328 } 3329 3330 3331 3332 /** 3333 * Extract the exponent of a IEEE-754 floating point value. 3334 * 3335 * Optionally apply an integer bias. 3336 * 3337 * Result is an integer value with 3338 * 3339 * ifloor(log2(x)) + bias 3340 */ 3341 LLVMValueRef 3342 lp_build_extract_exponent(struct lp_build_context *bld, 3343 LLVMValueRef x, 3344 int bias) 3345 { 3346 LLVMBuilderRef builder = bld->gallivm->builder; 3347 const struct lp_type type = bld->type; 3348 unsigned mantissa = lp_mantissa(type); 3349 LLVMValueRef res; 3350 3351 assert(type.floating); 3352 3353 assert(lp_check_value(bld->type, x)); 3354 3355 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3356 3357 res = LLVMBuildLShr(builder, x, 3358 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 3359 res = LLVMBuildAnd(builder, res, 3360 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 3361 res = LLVMBuildSub(builder, res, 3362 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 3363 3364 return res; 3365 } 3366 3367 3368 /** 3369 * Extract the mantissa of the a floating. 3370 * 3371 * Result is a floating point value with 3372 * 3373 * x / floor(log2(x)) 3374 */ 3375 LLVMValueRef 3376 lp_build_extract_mantissa(struct lp_build_context *bld, 3377 LLVMValueRef x) 3378 { 3379 LLVMBuilderRef builder = bld->gallivm->builder; 3380 const struct lp_type type = bld->type; 3381 unsigned mantissa = lp_mantissa(type); 3382 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 3383 (1ULL << mantissa) - 1); 3384 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 3385 LLVMValueRef res; 3386 3387 assert(lp_check_value(bld->type, x)); 3388 3389 assert(type.floating); 3390 3391 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3392 3393 /* res = x / 2**ipart */ 3394 res = LLVMBuildAnd(builder, x, mantmask, ""); 3395 res = LLVMBuildOr(builder, res, one, ""); 3396 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 3397 3398 return res; 3399 } 3400 3401 3402 3403 /** 3404 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 3405 * These coefficients can be generate with 3406 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 3407 */ 3408 const double lp_build_log2_polynomial[] = { 3409 #if LOG_POLY_DEGREE == 5 3410 2.88539008148777786488L, 3411 0.961796878841293367824L, 3412 0.577058946784739859012L, 3413 0.412914355135828735411L, 3414 0.308591899232910175289L, 3415 0.352376952300281371868L, 3416 #elif LOG_POLY_DEGREE == 4 3417 2.88539009343309178325L, 3418 0.961791550404184197881L, 3419 0.577440339438736392009L, 3420 0.403343858251329912514L, 3421 0.406718052498846252698L, 3422 #elif LOG_POLY_DEGREE == 3 3423 2.88538959748872753838L, 3424 0.961932915889597772928L, 3425 0.571118517972136195241L, 3426 0.493997535084709500285L, 3427 #else 3428 #error 3429 #endif 3430 }; 3431 3432 /** 3433 * See http://www.devmaster.net/forums/showthread.php?p=43580 3434 * http://en.wikipedia.org/wiki/Logarithm#Calculation 3435 * http://www.nezumi.demon.co.uk/consult/logx.htm 3436 * 3437 * If handle_edge_cases is true the function will perform computations 3438 * to match the required D3D10+ behavior for each of the edge cases. 3439 * That means that if input is: 3440 * - less than zero (to and including -inf) then NaN will be returned 3441 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned 3442 * - +infinity, then +infinity will be returned 3443 * - NaN, then NaN will be returned 3444 * 3445 * Those checks are fairly expensive so if you don't need them make sure 3446 * handle_edge_cases is false. 3447 */ 3448 void 3449 lp_build_log2_approx(struct lp_build_context *bld, 3450 LLVMValueRef x, 3451 LLVMValueRef *p_exp, 3452 LLVMValueRef *p_floor_log2, 3453 LLVMValueRef *p_log2, 3454 boolean handle_edge_cases) 3455 { 3456 LLVMBuilderRef builder = bld->gallivm->builder; 3457 const struct lp_type type = bld->type; 3458 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3459 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 3460 3461 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 3462 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 3463 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 3464 3465 LLVMValueRef i = NULL; 3466 LLVMValueRef y = NULL; 3467 LLVMValueRef z = NULL; 3468 LLVMValueRef exp = NULL; 3469 LLVMValueRef mant = NULL; 3470 LLVMValueRef logexp = NULL; 3471 LLVMValueRef p_z = NULL; 3472 LLVMValueRef res = NULL; 3473 3474 assert(lp_check_value(bld->type, x)); 3475 3476 if(p_exp || p_floor_log2 || p_log2) { 3477 /* TODO: optimize the constant case */ 3478 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3479 LLVMIsConstant(x)) { 3480 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3481 __FUNCTION__); 3482 } 3483 3484 assert(type.floating && type.width == 32); 3485 3486 /* 3487 * We don't explicitly handle denormalized numbers. They will yield a 3488 * result in the neighbourhood of -127, which appears to be adequate 3489 * enough. 3490 */ 3491 3492 i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3493 3494 /* exp = (float) exponent(x) */ 3495 exp = LLVMBuildAnd(builder, i, expmask, ""); 3496 } 3497 3498 if(p_floor_log2 || p_log2) { 3499 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3500 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3501 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 3502 } 3503 3504 if (p_log2) { 3505 /* mant = 1 + (float) mantissa(x) */ 3506 mant = LLVMBuildAnd(builder, i, mantmask, ""); 3507 mant = LLVMBuildOr(builder, mant, one, ""); 3508 mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 3509 3510 /* y = (mant - 1) / (mant + 1) */ 3511 y = lp_build_div(bld, 3512 lp_build_sub(bld, mant, bld->one), 3513 lp_build_add(bld, mant, bld->one) 3514 ); 3515 3516 /* z = y^2 */ 3517 z = lp_build_mul(bld, y, y); 3518 3519 /* compute P(z) */ 3520 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 3521 ARRAY_SIZE(lp_build_log2_polynomial)); 3522 3523 /* y * P(z) + logexp */ 3524 res = lp_build_mad(bld, y, p_z, logexp); 3525 3526 if (type.floating && handle_edge_cases) { 3527 LLVMValueRef negmask, infmask, zmask; 3528 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x, 3529 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3530 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, 3531 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3532 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x, 3533 lp_build_const_vec(bld->gallivm, type, INFINITY)); 3534 3535 /* If x is qual to inf make sure we return inf */ 3536 res = lp_build_select(bld, infmask, 3537 lp_build_const_vec(bld->gallivm, type, INFINITY), 3538 res); 3539 /* If x is qual to 0, return -inf */ 3540 res = lp_build_select(bld, zmask, 3541 lp_build_const_vec(bld->gallivm, type, -INFINITY), 3542 res); 3543 /* If x is nan or less than 0, return nan */ 3544 res = lp_build_select(bld, negmask, 3545 lp_build_const_vec(bld->gallivm, type, NAN), 3546 res); 3547 } 3548 } 3549 3550 if (p_exp) { 3551 exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 3552 *p_exp = exp; 3553 } 3554 3555 if (p_floor_log2) 3556 *p_floor_log2 = logexp; 3557 3558 if (p_log2) 3559 *p_log2 = res; 3560 } 3561 3562 3563 /* 3564 * log2 implementation which doesn't have special code to 3565 * handle edge cases (-inf, 0, inf, NaN). It's faster but 3566 * the results for those cases are undefined. 3567 */ 3568 LLVMValueRef 3569 lp_build_log2(struct lp_build_context *bld, 3570 LLVMValueRef x) 3571 { 3572 LLVMValueRef res; 3573 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE); 3574 return res; 3575 } 3576 3577 /* 3578 * Version of log2 which handles all edge cases. 3579 * Look at documentation of lp_build_log2_approx for 3580 * description of the behavior for each of the edge cases. 3581 */ 3582 LLVMValueRef 3583 lp_build_log2_safe(struct lp_build_context *bld, 3584 LLVMValueRef x) 3585 { 3586 LLVMValueRef res; 3587 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE); 3588 return res; 3589 } 3590 3591 3592 /** 3593 * Faster (and less accurate) log2. 3594 * 3595 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 3596 * 3597 * Piece-wise linear approximation, with exact results when x is a 3598 * power of two. 3599 * 3600 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 3601 */ 3602 LLVMValueRef 3603 lp_build_fast_log2(struct lp_build_context *bld, 3604 LLVMValueRef x) 3605 { 3606 LLVMBuilderRef builder = bld->gallivm->builder; 3607 LLVMValueRef ipart; 3608 LLVMValueRef fpart; 3609 3610 assert(lp_check_value(bld->type, x)); 3611 3612 assert(bld->type.floating); 3613 3614 /* ipart = floor(log2(x)) - 1 */ 3615 ipart = lp_build_extract_exponent(bld, x, -1); 3616 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 3617 3618 /* fpart = x / 2**ipart */ 3619 fpart = lp_build_extract_mantissa(bld, x); 3620 3621 /* ipart + fpart */ 3622 return LLVMBuildFAdd(builder, ipart, fpart, ""); 3623 } 3624 3625 3626 /** 3627 * Fast implementation of iround(log2(x)). 3628 * 3629 * Not an approximation -- it should give accurate results all the time. 3630 */ 3631 LLVMValueRef 3632 lp_build_ilog2(struct lp_build_context *bld, 3633 LLVMValueRef x) 3634 { 3635 LLVMBuilderRef builder = bld->gallivm->builder; 3636 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 3637 LLVMValueRef ipart; 3638 3639 assert(bld->type.floating); 3640 3641 assert(lp_check_value(bld->type, x)); 3642 3643 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 3644 x = LLVMBuildFMul(builder, x, sqrt2, ""); 3645 3646 /* ipart = floor(log2(x) + 0.5) */ 3647 ipart = lp_build_extract_exponent(bld, x, 0); 3648 3649 return ipart; 3650 } 3651 3652 LLVMValueRef 3653 lp_build_mod(struct lp_build_context *bld, 3654 LLVMValueRef x, 3655 LLVMValueRef y) 3656 { 3657 LLVMBuilderRef builder = bld->gallivm->builder; 3658 LLVMValueRef res; 3659 const struct lp_type type = bld->type; 3660 3661 assert(lp_check_value(type, x)); 3662 assert(lp_check_value(type, y)); 3663 3664 if (type.floating) 3665 res = LLVMBuildFRem(builder, x, y, ""); 3666 else if (type.sign) 3667 res = LLVMBuildSRem(builder, x, y, ""); 3668 else 3669 res = LLVMBuildURem(builder, x, y, ""); 3670 return res; 3671 } 3672 3673 3674 /* 3675 * For floating inputs it creates and returns a mask 3676 * which is all 1's for channels which are NaN. 3677 * Channels inside x which are not NaN will be 0. 3678 */ 3679 LLVMValueRef 3680 lp_build_isnan(struct lp_build_context *bld, 3681 LLVMValueRef x) 3682 { 3683 LLVMValueRef mask; 3684 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3685 3686 assert(bld->type.floating); 3687 assert(lp_check_value(bld->type, x)); 3688 3689 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x, 3690 "isnotnan"); 3691 mask = LLVMBuildNot(bld->gallivm->builder, mask, ""); 3692 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan"); 3693 return mask; 3694 } 3695 3696 /* Returns all 1's for floating point numbers that are 3697 * finite numbers and returns all zeros for -inf, 3698 * inf and nan's */ 3699 LLVMValueRef 3700 lp_build_isfinite(struct lp_build_context *bld, 3701 LLVMValueRef x) 3702 { 3703 LLVMBuilderRef builder = bld->gallivm->builder; 3704 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3705 struct lp_type int_type = lp_int_type(bld->type); 3706 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3707 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type, 3708 0x7f800000); 3709 3710 if (!bld->type.floating) { 3711 return lp_build_const_int_vec(bld->gallivm, bld->type, 0); 3712 } 3713 assert(bld->type.floating); 3714 assert(lp_check_value(bld->type, x)); 3715 assert(bld->type.width == 32); 3716 3717 intx = LLVMBuildAnd(builder, intx, infornan32, ""); 3718 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL, 3719 intx, infornan32); 3720 } 3721 3722 /* 3723 * Returns true if the number is nan or inf and false otherwise. 3724 * The input has to be a floating point vector. 3725 */ 3726 LLVMValueRef 3727 lp_build_is_inf_or_nan(struct gallivm_state *gallivm, 3728 const struct lp_type type, 3729 LLVMValueRef x) 3730 { 3731 LLVMBuilderRef builder = gallivm->builder; 3732 struct lp_type int_type = lp_int_type(type); 3733 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type, 3734 0x7f800000); 3735 LLVMValueRef ret; 3736 3737 assert(type.floating); 3738 3739 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), ""); 3740 ret = LLVMBuildAnd(builder, ret, const0, ""); 3741 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL, 3742 ret, const0); 3743 3744 return ret; 3745 } 3746 3747 3748 LLVMValueRef 3749 lp_build_fpstate_get(struct gallivm_state *gallivm) 3750 { 3751 if (util_cpu_caps.has_sse) { 3752 LLVMBuilderRef builder = gallivm->builder; 3753 LLVMValueRef mxcsr_ptr = lp_build_alloca( 3754 gallivm, 3755 LLVMInt32TypeInContext(gallivm->context), 3756 "mxcsr_ptr"); 3757 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr, 3758 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3759 lp_build_intrinsic(builder, 3760 "llvm.x86.sse.stmxcsr", 3761 LLVMVoidTypeInContext(gallivm->context), 3762 &mxcsr_ptr8, 1, 0); 3763 return mxcsr_ptr; 3764 } 3765 return 0; 3766 } 3767 3768 void 3769 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, 3770 boolean zero) 3771 { 3772 if (util_cpu_caps.has_sse) { 3773 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ 3774 int daz_ftz = _MM_FLUSH_ZERO_MASK; 3775 3776 LLVMBuilderRef builder = gallivm->builder; 3777 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm); 3778 LLVMValueRef mxcsr = 3779 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr"); 3780 3781 if (util_cpu_caps.has_daz) { 3782 /* Enable denormals are zero mode */ 3783 daz_ftz |= _MM_DENORMALS_ZERO_MASK; 3784 } 3785 if (zero) { 3786 mxcsr = LLVMBuildOr(builder, mxcsr, 3787 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), ""); 3788 } else { 3789 mxcsr = LLVMBuildAnd(builder, mxcsr, 3790 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), ""); 3791 } 3792 3793 LLVMBuildStore(builder, mxcsr, mxcsr_ptr); 3794 lp_build_fpstate_set(gallivm, mxcsr_ptr); 3795 } 3796 } 3797 3798 void 3799 lp_build_fpstate_set(struct gallivm_state *gallivm, 3800 LLVMValueRef mxcsr_ptr) 3801 { 3802 if (util_cpu_caps.has_sse) { 3803 LLVMBuilderRef builder = gallivm->builder; 3804 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, 3805 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3806 lp_build_intrinsic(builder, 3807 "llvm.x86.sse.ldmxcsr", 3808 LLVMVoidTypeInContext(gallivm->context), 3809 &mxcsr_ptr, 1, 0); 3810 } 3811 } 3812