1 /************************************************************************** 2 * 3 * Copyright 2009-2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 29 /** 30 * @file 31 * Helper 32 * 33 * LLVM IR doesn't support all basic arithmetic operations we care about (most 34 * notably min/max and saturated operations), and it is often necessary to 35 * resort machine-specific intrinsics directly. The functions here hide all 36 * these implementation details from the other modules. 37 * 38 * We also do simple expressions simplification here. Reasons are: 39 * - it is very easy given we have all necessary information readily available 40 * - LLVM optimization passes fail to simplify several vector expressions 41 * - We often know value constraints which the optimization passes have no way 42 * of knowing, such as when source arguments are known to be in [0, 1] range. 43 * 44 * @author Jose Fonseca <jfonseca (at) vmware.com> 45 */ 46 47 48 #include "util/u_memory.h" 49 #include "util/u_debug.h" 50 #include "util/u_math.h" 51 #include "util/u_string.h" 52 #include "util/u_cpu_detect.h" 53 54 #include "lp_bld_type.h" 55 #include "lp_bld_const.h" 56 #include "lp_bld_init.h" 57 #include "lp_bld_intr.h" 58 #include "lp_bld_logic.h" 59 #include "lp_bld_pack.h" 60 #include "lp_bld_debug.h" 61 #include "lp_bld_arit.h" 62 63 64 #define EXP_POLY_DEGREE 5 65 66 #define LOG_POLY_DEGREE 4 67 68 69 /** 70 * Generate min(a, b) 71 * No checks for special case values of a or b = 1 or 0 are done. 72 */ 73 static LLVMValueRef 74 lp_build_min_simple(struct lp_build_context *bld, 75 LLVMValueRef a, 76 LLVMValueRef b) 77 { 78 const struct lp_type type = bld->type; 79 const char *intrinsic = NULL; 80 unsigned intr_size = 0; 81 LLVMValueRef cond; 82 83 assert(lp_check_value(type, a)); 84 assert(lp_check_value(type, b)); 85 86 /* TODO: optimize the constant case */ 87 88 if (type.floating && util_cpu_caps.has_sse) { 89 if (type.width == 32) { 90 if (type.length == 1) { 91 intrinsic = "llvm.x86.sse.min.ss"; 92 intr_size = 128; 93 } 94 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 95 intrinsic = "llvm.x86.sse.min.ps"; 96 intr_size = 128; 97 } 98 else { 99 intrinsic = "llvm.x86.avx.min.ps.256"; 100 intr_size = 256; 101 } 102 } 103 if (type.width == 64 && util_cpu_caps.has_sse2) { 104 if (type.length == 1) { 105 intrinsic = "llvm.x86.sse2.min.sd"; 106 intr_size = 128; 107 } 108 else if (type.length == 2 || !util_cpu_caps.has_avx) { 109 intrinsic = "llvm.x86.sse2.min.pd"; 110 intr_size = 128; 111 } 112 else { 113 intrinsic = "llvm.x86.avx.min.pd.256"; 114 intr_size = 256; 115 } 116 } 117 } 118 else if (util_cpu_caps.has_sse2 && type.length >= 2) { 119 intr_size = 128; 120 if ((type.width == 8 || type.width == 16) && 121 (type.width * type.length <= 64) && 122 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 123 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 124 __FUNCTION__); 125 } 126 if (type.width == 8 && !type.sign) { 127 intrinsic = "llvm.x86.sse2.pminu.b"; 128 } 129 else if (type.width == 16 && type.sign) { 130 intrinsic = "llvm.x86.sse2.pmins.w"; 131 } 132 if (util_cpu_caps.has_sse4_1) { 133 if (type.width == 8 && type.sign) { 134 intrinsic = "llvm.x86.sse41.pminsb"; 135 } 136 if (type.width == 16 && !type.sign) { 137 intrinsic = "llvm.x86.sse41.pminuw"; 138 } 139 if (type.width == 32 && !type.sign) { 140 intrinsic = "llvm.x86.sse41.pminud"; 141 } 142 if (type.width == 32 && type.sign) { 143 intrinsic = "llvm.x86.sse41.pminsd"; 144 } 145 } 146 } 147 148 if(intrinsic) { 149 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 150 type, 151 intr_size, a, b); 152 } 153 154 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 155 return lp_build_select(bld, cond, a, b); 156 } 157 158 159 /** 160 * Generate max(a, b) 161 * No checks for special case values of a or b = 1 or 0 are done. 162 */ 163 static LLVMValueRef 164 lp_build_max_simple(struct lp_build_context *bld, 165 LLVMValueRef a, 166 LLVMValueRef b) 167 { 168 const struct lp_type type = bld->type; 169 const char *intrinsic = NULL; 170 unsigned intr_size = 0; 171 LLVMValueRef cond; 172 173 assert(lp_check_value(type, a)); 174 assert(lp_check_value(type, b)); 175 176 /* TODO: optimize the constant case */ 177 178 if (type.floating && util_cpu_caps.has_sse) { 179 if (type.width == 32) { 180 if (type.length == 1) { 181 intrinsic = "llvm.x86.sse.max.ss"; 182 intr_size = 128; 183 } 184 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 185 intrinsic = "llvm.x86.sse.max.ps"; 186 intr_size = 128; 187 } 188 else { 189 intrinsic = "llvm.x86.avx.max.ps.256"; 190 intr_size = 256; 191 } 192 } 193 if (type.width == 64 && util_cpu_caps.has_sse2) { 194 if (type.length == 1) { 195 intrinsic = "llvm.x86.sse2.max.sd"; 196 intr_size = 128; 197 } 198 else if (type.length == 2 || !util_cpu_caps.has_avx) { 199 intrinsic = "llvm.x86.sse2.max.pd"; 200 intr_size = 128; 201 } 202 else { 203 intrinsic = "llvm.x86.avx.max.pd.256"; 204 intr_size = 256; 205 } 206 } 207 } 208 else if (util_cpu_caps.has_sse2 && type.length >= 2) { 209 intr_size = 128; 210 if ((type.width == 8 || type.width == 16) && 211 (type.width * type.length <= 64) && 212 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 213 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 214 __FUNCTION__); 215 } 216 if (type.width == 8 && !type.sign) { 217 intrinsic = "llvm.x86.sse2.pmaxu.b"; 218 intr_size = 128; 219 } 220 else if (type.width == 16 && type.sign) { 221 intrinsic = "llvm.x86.sse2.pmaxs.w"; 222 } 223 if (util_cpu_caps.has_sse4_1) { 224 if (type.width == 8 && type.sign) { 225 intrinsic = "llvm.x86.sse41.pmaxsb"; 226 } 227 if (type.width == 16 && !type.sign) { 228 intrinsic = "llvm.x86.sse41.pmaxuw"; 229 } 230 if (type.width == 32 && !type.sign) { 231 intrinsic = "llvm.x86.sse41.pmaxud"; 232 } 233 if (type.width == 32 && type.sign) { 234 intrinsic = "llvm.x86.sse41.pmaxsd"; 235 } 236 } 237 } 238 239 if(intrinsic) { 240 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 241 type, 242 intr_size, a, b); 243 } 244 245 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 246 return lp_build_select(bld, cond, a, b); 247 } 248 249 250 /** 251 * Generate 1 - a, or ~a depending on bld->type. 252 */ 253 LLVMValueRef 254 lp_build_comp(struct lp_build_context *bld, 255 LLVMValueRef a) 256 { 257 LLVMBuilderRef builder = bld->gallivm->builder; 258 const struct lp_type type = bld->type; 259 260 assert(lp_check_value(type, a)); 261 262 if(a == bld->one) 263 return bld->zero; 264 if(a == bld->zero) 265 return bld->one; 266 267 if(type.norm && !type.floating && !type.fixed && !type.sign) { 268 if(LLVMIsConstant(a)) 269 return LLVMConstNot(a); 270 else 271 return LLVMBuildNot(builder, a, ""); 272 } 273 274 if(LLVMIsConstant(a)) 275 if (type.floating) 276 return LLVMConstFSub(bld->one, a); 277 else 278 return LLVMConstSub(bld->one, a); 279 else 280 if (type.floating) 281 return LLVMBuildFSub(builder, bld->one, a, ""); 282 else 283 return LLVMBuildSub(builder, bld->one, a, ""); 284 } 285 286 287 /** 288 * Generate a + b 289 */ 290 LLVMValueRef 291 lp_build_add(struct lp_build_context *bld, 292 LLVMValueRef a, 293 LLVMValueRef b) 294 { 295 LLVMBuilderRef builder = bld->gallivm->builder; 296 const struct lp_type type = bld->type; 297 LLVMValueRef res; 298 299 assert(lp_check_value(type, a)); 300 assert(lp_check_value(type, b)); 301 302 if(a == bld->zero) 303 return b; 304 if(b == bld->zero) 305 return a; 306 if(a == bld->undef || b == bld->undef) 307 return bld->undef; 308 309 if(bld->type.norm) { 310 const char *intrinsic = NULL; 311 312 if(a == bld->one || b == bld->one) 313 return bld->one; 314 315 if(util_cpu_caps.has_sse2 && 316 type.width * type.length == 128 && 317 !type.floating && !type.fixed) { 318 if(type.width == 8) 319 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 320 if(type.width == 16) 321 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 322 } 323 324 if(intrinsic) 325 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 326 } 327 328 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 329 if (type.floating) 330 res = LLVMConstFAdd(a, b); 331 else 332 res = LLVMConstAdd(a, b); 333 else 334 if (type.floating) 335 res = LLVMBuildFAdd(builder, a, b, ""); 336 else 337 res = LLVMBuildAdd(builder, a, b, ""); 338 339 /* clamp to ceiling of 1.0 */ 340 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 341 res = lp_build_min_simple(bld, res, bld->one); 342 343 /* XXX clamp to floor of -1 or 0??? */ 344 345 return res; 346 } 347 348 349 /** Return the scalar sum of the elements of a. 350 * Should avoid this operation whenever possible. 351 */ 352 LLVMValueRef 353 lp_build_horizontal_add(struct lp_build_context *bld, 354 LLVMValueRef a) 355 { 356 LLVMBuilderRef builder = bld->gallivm->builder; 357 const struct lp_type type = bld->type; 358 LLVMValueRef index, res; 359 unsigned i, length; 360 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 361 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 362 LLVMValueRef vecres, elem2; 363 364 assert(lp_check_value(type, a)); 365 366 if (type.length == 1) { 367 return a; 368 } 369 370 assert(!bld->type.norm); 371 372 /* 373 * for byte vectors can do much better with psadbw. 374 * Using repeated shuffle/adds here. Note with multiple vectors 375 * this can be done more efficiently as outlined in the intel 376 * optimization manual. 377 * Note: could cause data rearrangement if used with smaller element 378 * sizes. 379 */ 380 381 vecres = a; 382 length = type.length / 2; 383 while (length > 1) { 384 LLVMValueRef vec1, vec2; 385 for (i = 0; i < length; i++) { 386 shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 387 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 388 } 389 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 390 LLVMConstVector(shuffles1, length), ""); 391 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 392 LLVMConstVector(shuffles2, length), ""); 393 if (type.floating) { 394 vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 395 } 396 else { 397 vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 398 } 399 length = length >> 1; 400 } 401 402 /* always have vector of size 2 here */ 403 assert(length == 1); 404 405 index = lp_build_const_int32(bld->gallivm, 0); 406 res = LLVMBuildExtractElement(builder, vecres, index, ""); 407 index = lp_build_const_int32(bld->gallivm, 1); 408 elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 409 410 if (type.floating) 411 res = LLVMBuildFAdd(builder, res, elem2, ""); 412 else 413 res = LLVMBuildAdd(builder, res, elem2, ""); 414 415 return res; 416 } 417 418 /** 419 * Return the horizontal sums of 4 float vectors as a float4 vector. 420 * This uses the technique as outlined in Intel Optimization Manual. 421 */ 422 static LLVMValueRef 423 lp_build_horizontal_add4x4f(struct lp_build_context *bld, 424 LLVMValueRef src[4]) 425 { 426 struct gallivm_state *gallivm = bld->gallivm; 427 LLVMBuilderRef builder = gallivm->builder; 428 LLVMValueRef shuffles[4]; 429 LLVMValueRef tmp[4]; 430 LLVMValueRef sumtmp[2], shuftmp[2]; 431 432 /* lower half of regs */ 433 shuffles[0] = lp_build_const_int32(gallivm, 0); 434 shuffles[1] = lp_build_const_int32(gallivm, 1); 435 shuffles[2] = lp_build_const_int32(gallivm, 4); 436 shuffles[3] = lp_build_const_int32(gallivm, 5); 437 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 438 LLVMConstVector(shuffles, 4), ""); 439 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 440 LLVMConstVector(shuffles, 4), ""); 441 442 /* upper half of regs */ 443 shuffles[0] = lp_build_const_int32(gallivm, 2); 444 shuffles[1] = lp_build_const_int32(gallivm, 3); 445 shuffles[2] = lp_build_const_int32(gallivm, 6); 446 shuffles[3] = lp_build_const_int32(gallivm, 7); 447 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 448 LLVMConstVector(shuffles, 4), ""); 449 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 450 LLVMConstVector(shuffles, 4), ""); 451 452 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 453 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 454 455 shuffles[0] = lp_build_const_int32(gallivm, 0); 456 shuffles[1] = lp_build_const_int32(gallivm, 2); 457 shuffles[2] = lp_build_const_int32(gallivm, 4); 458 shuffles[3] = lp_build_const_int32(gallivm, 6); 459 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 460 LLVMConstVector(shuffles, 4), ""); 461 462 shuffles[0] = lp_build_const_int32(gallivm, 1); 463 shuffles[1] = lp_build_const_int32(gallivm, 3); 464 shuffles[2] = lp_build_const_int32(gallivm, 5); 465 shuffles[3] = lp_build_const_int32(gallivm, 7); 466 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 467 LLVMConstVector(shuffles, 4), ""); 468 469 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 470 } 471 472 473 /* 474 * partially horizontally add 2-4 float vectors with length nx4, 475 * i.e. only four adjacent values in each vector will be added, 476 * assuming values are really grouped in 4 which also determines 477 * output order. 478 * 479 * Return a vector of the same length as the initial vectors, 480 * with the excess elements (if any) being undefined. 481 * The element order is independent of number of input vectors. 482 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 483 * the output order thus will be 484 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef 485 */ 486 LLVMValueRef 487 lp_build_hadd_partial4(struct lp_build_context *bld, 488 LLVMValueRef vectors[], 489 unsigned num_vecs) 490 { 491 struct gallivm_state *gallivm = bld->gallivm; 492 LLVMBuilderRef builder = gallivm->builder; 493 LLVMValueRef ret_vec; 494 LLVMValueRef tmp[4]; 495 const char *intrinsic = NULL; 496 497 assert(num_vecs >= 2 && num_vecs <= 4); 498 assert(bld->type.floating); 499 500 /* only use this with at least 2 vectors, as it is sort of expensive 501 * (depending on cpu) and we always need two horizontal adds anyway, 502 * so a shuffle/add approach might be better. 503 */ 504 505 tmp[0] = vectors[0]; 506 tmp[1] = vectors[1]; 507 508 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; 509 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; 510 511 if (util_cpu_caps.has_sse3 && bld->type.width == 32 && 512 bld->type.length == 4) { 513 intrinsic = "llvm.x86.sse3.hadd.ps"; 514 } 515 else if (util_cpu_caps.has_avx && bld->type.width == 32 && 516 bld->type.length == 8) { 517 intrinsic = "llvm.x86.avx.hadd.ps.256"; 518 } 519 if (intrinsic) { 520 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, 521 lp_build_vec_type(gallivm, bld->type), 522 tmp[0], tmp[1]); 523 if (num_vecs > 2) { 524 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, 525 lp_build_vec_type(gallivm, bld->type), 526 tmp[2], tmp[3]); 527 } 528 else { 529 tmp[1] = tmp[0]; 530 } 531 return lp_build_intrinsic_binary(builder, intrinsic, 532 lp_build_vec_type(gallivm, bld->type), 533 tmp[0], tmp[1]); 534 } 535 536 if (bld->type.length == 4) { 537 ret_vec = lp_build_horizontal_add4x4f(bld, tmp); 538 } 539 else { 540 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; 541 unsigned j; 542 unsigned num_iter = bld->type.length / 4; 543 struct lp_type parttype = bld->type; 544 parttype.length = 4; 545 for (j = 0; j < num_iter; j++) { 546 LLVMValueRef partsrc[4]; 547 unsigned i; 548 for (i = 0; i < 4; i++) { 549 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); 550 } 551 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); 552 } 553 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); 554 } 555 return ret_vec; 556 } 557 558 /** 559 * Generate a - b 560 */ 561 LLVMValueRef 562 lp_build_sub(struct lp_build_context *bld, 563 LLVMValueRef a, 564 LLVMValueRef b) 565 { 566 LLVMBuilderRef builder = bld->gallivm->builder; 567 const struct lp_type type = bld->type; 568 LLVMValueRef res; 569 570 assert(lp_check_value(type, a)); 571 assert(lp_check_value(type, b)); 572 573 if(b == bld->zero) 574 return a; 575 if(a == bld->undef || b == bld->undef) 576 return bld->undef; 577 if(a == b) 578 return bld->zero; 579 580 if(bld->type.norm) { 581 const char *intrinsic = NULL; 582 583 if(b == bld->one) 584 return bld->zero; 585 586 if(util_cpu_caps.has_sse2 && 587 type.width * type.length == 128 && 588 !type.floating && !type.fixed) { 589 if(type.width == 8) 590 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 591 if(type.width == 16) 592 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 593 } 594 595 if(intrinsic) 596 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 597 } 598 599 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 600 if (type.floating) 601 res = LLVMConstFSub(a, b); 602 else 603 res = LLVMConstSub(a, b); 604 else 605 if (type.floating) 606 res = LLVMBuildFSub(builder, a, b, ""); 607 else 608 res = LLVMBuildSub(builder, a, b, ""); 609 610 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 611 res = lp_build_max_simple(bld, res, bld->zero); 612 613 return res; 614 } 615 616 617 /** 618 * Normalized 8bit multiplication. 619 * 620 * - alpha plus one 621 * 622 * makes the following approximation to the division (Sree) 623 * 624 * a*b/255 ~= (a*(b + 1)) >> 256 625 * 626 * which is the fastest method that satisfies the following OpenGL criteria 627 * 628 * 0*0 = 0 and 255*255 = 255 629 * 630 * - geometric series 631 * 632 * takes the geometric series approximation to the division 633 * 634 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. 635 * 636 * in this case just the first two terms to fit in 16bit arithmetic 637 * 638 * t/255 ~= (t + (t >> 8)) >> 8 639 * 640 * note that just by itself it doesn't satisfies the OpenGL criteria, as 641 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff 642 * must be used 643 * 644 * - geometric series plus rounding 645 * 646 * when using a geometric series division instead of truncating the result 647 * use roundoff in the approximation (Jim Blinn) 648 * 649 * t/255 ~= (t + (t >> 8) + 0x80) >> 8 650 * 651 * achieving the exact results 652 * 653 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 654 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf 655 * @sa Michael Herf, The "double blend trick", May 2000, 656 * http://www.stereopsis.com/doubleblend.html 657 */ 658 static LLVMValueRef 659 lp_build_mul_u8n(struct gallivm_state *gallivm, 660 struct lp_type i16_type, 661 LLVMValueRef a, LLVMValueRef b) 662 { 663 LLVMBuilderRef builder = gallivm->builder; 664 LLVMValueRef c8; 665 LLVMValueRef ab; 666 667 assert(!i16_type.floating); 668 assert(lp_check_value(i16_type, a)); 669 assert(lp_check_value(i16_type, b)); 670 671 c8 = lp_build_const_int_vec(gallivm, i16_type, 8); 672 673 #if 0 674 675 /* a*b/255 ~= (a*(b + 1)) >> 256 */ 676 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallium, i16_type, 1), ""); 677 ab = LLVMBuildMul(builder, a, b, ""); 678 679 #else 680 681 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ 682 ab = LLVMBuildMul(builder, a, b, ""); 683 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), ""); 684 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), ""); 685 686 #endif 687 688 ab = LLVMBuildLShr(builder, ab, c8, ""); 689 690 return ab; 691 } 692 693 694 /** 695 * Generate a * b 696 */ 697 LLVMValueRef 698 lp_build_mul(struct lp_build_context *bld, 699 LLVMValueRef a, 700 LLVMValueRef b) 701 { 702 LLVMBuilderRef builder = bld->gallivm->builder; 703 const struct lp_type type = bld->type; 704 LLVMValueRef shift; 705 LLVMValueRef res; 706 707 assert(lp_check_value(type, a)); 708 assert(lp_check_value(type, b)); 709 710 if(a == bld->zero) 711 return bld->zero; 712 if(a == bld->one) 713 return b; 714 if(b == bld->zero) 715 return bld->zero; 716 if(b == bld->one) 717 return a; 718 if(a == bld->undef || b == bld->undef) 719 return bld->undef; 720 721 if(!type.floating && !type.fixed && type.norm) { 722 if(type.width == 8) { 723 struct lp_type i16_type = lp_wider_type(type); 724 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 725 726 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah); 727 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh); 728 729 /* PMULLW, PSRLW, PADDW */ 730 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl); 731 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh); 732 733 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh); 734 735 return ab; 736 } 737 738 /* FIXME */ 739 assert(0); 740 } 741 742 if(type.fixed) 743 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 744 else 745 shift = NULL; 746 747 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 748 if (type.floating) 749 res = LLVMConstFMul(a, b); 750 else 751 res = LLVMConstMul(a, b); 752 if(shift) { 753 if(type.sign) 754 res = LLVMConstAShr(res, shift); 755 else 756 res = LLVMConstLShr(res, shift); 757 } 758 } 759 else { 760 if (type.floating) 761 res = LLVMBuildFMul(builder, a, b, ""); 762 else 763 res = LLVMBuildMul(builder, a, b, ""); 764 if(shift) { 765 if(type.sign) 766 res = LLVMBuildAShr(builder, res, shift, ""); 767 else 768 res = LLVMBuildLShr(builder, res, shift, ""); 769 } 770 } 771 772 return res; 773 } 774 775 776 /** 777 * Small vector x scale multiplication optimization. 778 */ 779 LLVMValueRef 780 lp_build_mul_imm(struct lp_build_context *bld, 781 LLVMValueRef a, 782 int b) 783 { 784 LLVMBuilderRef builder = bld->gallivm->builder; 785 LLVMValueRef factor; 786 787 assert(lp_check_value(bld->type, a)); 788 789 if(b == 0) 790 return bld->zero; 791 792 if(b == 1) 793 return a; 794 795 if(b == -1) 796 return lp_build_negate(bld, a); 797 798 if(b == 2 && bld->type.floating) 799 return lp_build_add(bld, a, a); 800 801 if(util_is_power_of_two(b)) { 802 unsigned shift = ffs(b) - 1; 803 804 if(bld->type.floating) { 805 #if 0 806 /* 807 * Power of two multiplication by directly manipulating the exponent. 808 * 809 * XXX: This might not be always faster, it will introduce a small error 810 * for multiplication by zero, and it will produce wrong results 811 * for Inf and NaN. 812 */ 813 unsigned mantissa = lp_mantissa(bld->type); 814 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 815 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 816 a = LLVMBuildAdd(builder, a, factor, ""); 817 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 818 return a; 819 #endif 820 } 821 else { 822 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 823 return LLVMBuildShl(builder, a, factor, ""); 824 } 825 } 826 827 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 828 return lp_build_mul(bld, a, factor); 829 } 830 831 832 /** 833 * Generate a / b 834 */ 835 LLVMValueRef 836 lp_build_div(struct lp_build_context *bld, 837 LLVMValueRef a, 838 LLVMValueRef b) 839 { 840 LLVMBuilderRef builder = bld->gallivm->builder; 841 const struct lp_type type = bld->type; 842 843 assert(lp_check_value(type, a)); 844 assert(lp_check_value(type, b)); 845 846 if(a == bld->zero) 847 return bld->zero; 848 if(a == bld->one) 849 return lp_build_rcp(bld, b); 850 if(b == bld->zero) 851 return bld->undef; 852 if(b == bld->one) 853 return a; 854 if(a == bld->undef || b == bld->undef) 855 return bld->undef; 856 857 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 858 if (type.floating) 859 return LLVMConstFDiv(a, b); 860 else if (type.sign) 861 return LLVMConstSDiv(a, b); 862 else 863 return LLVMConstUDiv(a, b); 864 } 865 866 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 867 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && 868 type.floating) 869 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 870 871 if (type.floating) 872 return LLVMBuildFDiv(builder, a, b, ""); 873 else if (type.sign) 874 return LLVMBuildSDiv(builder, a, b, ""); 875 else 876 return LLVMBuildUDiv(builder, a, b, ""); 877 } 878 879 880 /** 881 * Linear interpolation -- without any checks. 882 * 883 * @sa http://www.stereopsis.com/doubleblend.html 884 */ 885 static INLINE LLVMValueRef 886 lp_build_lerp_simple(struct lp_build_context *bld, 887 LLVMValueRef x, 888 LLVMValueRef v0, 889 LLVMValueRef v1) 890 { 891 LLVMBuilderRef builder = bld->gallivm->builder; 892 LLVMValueRef delta; 893 LLVMValueRef res; 894 895 assert(lp_check_value(bld->type, x)); 896 assert(lp_check_value(bld->type, v0)); 897 assert(lp_check_value(bld->type, v1)); 898 899 delta = lp_build_sub(bld, v1, v0); 900 901 res = lp_build_mul(bld, x, delta); 902 903 res = lp_build_add(bld, v0, res); 904 905 if (bld->type.fixed) { 906 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, 907 * but it will be wrong for other uses. Basically we need a more 908 * powerful lp_type, capable of further distinguishing the values 909 * interpretation from the value storage. */ 910 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), ""); 911 } 912 913 return res; 914 } 915 916 917 /** 918 * Linear interpolation. 919 */ 920 LLVMValueRef 921 lp_build_lerp(struct lp_build_context *bld, 922 LLVMValueRef x, 923 LLVMValueRef v0, 924 LLVMValueRef v1) 925 { 926 LLVMBuilderRef builder = bld->gallivm->builder; 927 const struct lp_type type = bld->type; 928 LLVMValueRef res; 929 930 assert(lp_check_value(type, x)); 931 assert(lp_check_value(type, v0)); 932 assert(lp_check_value(type, v1)); 933 934 if (type.norm) { 935 struct lp_type wide_type; 936 struct lp_build_context wide_bld; 937 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 938 LLVMValueRef shift; 939 940 assert(type.length >= 2); 941 assert(!type.sign); 942 943 /* 944 * Create a wider type, enough to hold the intermediate result of the 945 * multiplication. 946 */ 947 memset(&wide_type, 0, sizeof wide_type); 948 wide_type.fixed = TRUE; 949 wide_type.width = type.width*2; 950 wide_type.length = type.length/2; 951 952 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 953 954 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); 955 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 956 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 957 958 /* 959 * Scale x from [0, 255] to [0, 256] 960 */ 961 962 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1); 963 964 xl = lp_build_add(&wide_bld, xl, 965 LLVMBuildAShr(builder, xl, shift, "")); 966 xh = lp_build_add(&wide_bld, xh, 967 LLVMBuildAShr(builder, xh, shift, "")); 968 969 /* 970 * Lerp both halves. 971 */ 972 973 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); 974 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); 975 976 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); 977 } else { 978 res = lp_build_lerp_simple(bld, x, v0, v1); 979 } 980 981 return res; 982 } 983 984 985 LLVMValueRef 986 lp_build_lerp_2d(struct lp_build_context *bld, 987 LLVMValueRef x, 988 LLVMValueRef y, 989 LLVMValueRef v00, 990 LLVMValueRef v01, 991 LLVMValueRef v10, 992 LLVMValueRef v11) 993 { 994 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01); 995 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11); 996 return lp_build_lerp(bld, y, v0, v1); 997 } 998 999 1000 /** 1001 * Generate min(a, b) 1002 * Do checks for special cases. 1003 */ 1004 LLVMValueRef 1005 lp_build_min(struct lp_build_context *bld, 1006 LLVMValueRef a, 1007 LLVMValueRef b) 1008 { 1009 assert(lp_check_value(bld->type, a)); 1010 assert(lp_check_value(bld->type, b)); 1011 1012 if(a == bld->undef || b == bld->undef) 1013 return bld->undef; 1014 1015 if(a == b) 1016 return a; 1017 1018 if (bld->type.norm) { 1019 if (!bld->type.sign) { 1020 if (a == bld->zero || b == bld->zero) { 1021 return bld->zero; 1022 } 1023 } 1024 if(a == bld->one) 1025 return b; 1026 if(b == bld->one) 1027 return a; 1028 } 1029 1030 return lp_build_min_simple(bld, a, b); 1031 } 1032 1033 1034 /** 1035 * Generate max(a, b) 1036 * Do checks for special cases. 1037 */ 1038 LLVMValueRef 1039 lp_build_max(struct lp_build_context *bld, 1040 LLVMValueRef a, 1041 LLVMValueRef b) 1042 { 1043 assert(lp_check_value(bld->type, a)); 1044 assert(lp_check_value(bld->type, b)); 1045 1046 if(a == bld->undef || b == bld->undef) 1047 return bld->undef; 1048 1049 if(a == b) 1050 return a; 1051 1052 if(bld->type.norm) { 1053 if(a == bld->one || b == bld->one) 1054 return bld->one; 1055 if (!bld->type.sign) { 1056 if (a == bld->zero) { 1057 return b; 1058 } 1059 if (b == bld->zero) { 1060 return a; 1061 } 1062 } 1063 } 1064 1065 return lp_build_max_simple(bld, a, b); 1066 } 1067 1068 1069 /** 1070 * Generate clamp(a, min, max) 1071 * Do checks for special cases. 1072 */ 1073 LLVMValueRef 1074 lp_build_clamp(struct lp_build_context *bld, 1075 LLVMValueRef a, 1076 LLVMValueRef min, 1077 LLVMValueRef max) 1078 { 1079 assert(lp_check_value(bld->type, a)); 1080 assert(lp_check_value(bld->type, min)); 1081 assert(lp_check_value(bld->type, max)); 1082 1083 a = lp_build_min(bld, a, max); 1084 a = lp_build_max(bld, a, min); 1085 return a; 1086 } 1087 1088 1089 /** 1090 * Generate abs(a) 1091 */ 1092 LLVMValueRef 1093 lp_build_abs(struct lp_build_context *bld, 1094 LLVMValueRef a) 1095 { 1096 LLVMBuilderRef builder = bld->gallivm->builder; 1097 const struct lp_type type = bld->type; 1098 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1099 1100 assert(lp_check_value(type, a)); 1101 1102 if(!type.sign) 1103 return a; 1104 1105 if(type.floating) { 1106 /* Mask out the sign bit */ 1107 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1108 unsigned long long absMask = ~(1ULL << (type.width - 1)); 1109 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 1110 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1111 a = LLVMBuildAnd(builder, a, mask, ""); 1112 a = LLVMBuildBitCast(builder, a, vec_type, ""); 1113 return a; 1114 } 1115 1116 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { 1117 switch(type.width) { 1118 case 8: 1119 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1120 case 16: 1121 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1122 case 32: 1123 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1124 } 1125 } 1126 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && 1127 (gallivm_debug & GALLIVM_DEBUG_PERF) && 1128 (type.width == 8 || type.width == 16 || type.width == 32)) { 1129 debug_printf("%s: inefficient code, should split vectors manually\n", 1130 __FUNCTION__); 1131 } 1132 1133 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); 1134 } 1135 1136 1137 LLVMValueRef 1138 lp_build_negate(struct lp_build_context *bld, 1139 LLVMValueRef a) 1140 { 1141 LLVMBuilderRef builder = bld->gallivm->builder; 1142 1143 assert(lp_check_value(bld->type, a)); 1144 1145 #if HAVE_LLVM >= 0x0207 1146 if (bld->type.floating) 1147 a = LLVMBuildFNeg(builder, a, ""); 1148 else 1149 #endif 1150 a = LLVMBuildNeg(builder, a, ""); 1151 1152 return a; 1153 } 1154 1155 1156 /** Return -1, 0 or +1 depending on the sign of a */ 1157 LLVMValueRef 1158 lp_build_sgn(struct lp_build_context *bld, 1159 LLVMValueRef a) 1160 { 1161 LLVMBuilderRef builder = bld->gallivm->builder; 1162 const struct lp_type type = bld->type; 1163 LLVMValueRef cond; 1164 LLVMValueRef res; 1165 1166 assert(lp_check_value(type, a)); 1167 1168 /* Handle non-zero case */ 1169 if(!type.sign) { 1170 /* if not zero then sign must be positive */ 1171 res = bld->one; 1172 } 1173 else if(type.floating) { 1174 LLVMTypeRef vec_type; 1175 LLVMTypeRef int_type; 1176 LLVMValueRef mask; 1177 LLVMValueRef sign; 1178 LLVMValueRef one; 1179 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1180 1181 int_type = lp_build_int_vec_type(bld->gallivm, type); 1182 vec_type = lp_build_vec_type(bld->gallivm, type); 1183 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1184 1185 /* Take the sign bit and add it to 1 constant */ 1186 sign = LLVMBuildBitCast(builder, a, int_type, ""); 1187 sign = LLVMBuildAnd(builder, sign, mask, ""); 1188 one = LLVMConstBitCast(bld->one, int_type); 1189 res = LLVMBuildOr(builder, sign, one, ""); 1190 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1191 } 1192 else 1193 { 1194 /* signed int/norm/fixed point */ 1195 /* could use psign with sse3 and appropriate vectors here */ 1196 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1197 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1198 res = lp_build_select(bld, cond, bld->one, minus_one); 1199 } 1200 1201 /* Handle zero */ 1202 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1203 res = lp_build_select(bld, cond, bld->zero, res); 1204 1205 return res; 1206 } 1207 1208 1209 /** 1210 * Set the sign of float vector 'a' according to 'sign'. 1211 * If sign==0, return abs(a). 1212 * If sign==1, return -abs(a); 1213 * Other values for sign produce undefined results. 1214 */ 1215 LLVMValueRef 1216 lp_build_set_sign(struct lp_build_context *bld, 1217 LLVMValueRef a, LLVMValueRef sign) 1218 { 1219 LLVMBuilderRef builder = bld->gallivm->builder; 1220 const struct lp_type type = bld->type; 1221 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1222 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1223 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1224 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1225 ~((unsigned long long) 1 << (type.width - 1))); 1226 LLVMValueRef val, res; 1227 1228 assert(type.floating); 1229 assert(lp_check_value(type, a)); 1230 1231 /* val = reinterpret_cast<int>(a) */ 1232 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1233 /* val = val & mask */ 1234 val = LLVMBuildAnd(builder, val, mask, ""); 1235 /* sign = sign << shift */ 1236 sign = LLVMBuildShl(builder, sign, shift, ""); 1237 /* res = val | sign */ 1238 res = LLVMBuildOr(builder, val, sign, ""); 1239 /* res = reinterpret_cast<float>(res) */ 1240 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1241 1242 return res; 1243 } 1244 1245 1246 /** 1247 * Convert vector of (or scalar) int to vector of (or scalar) float. 1248 */ 1249 LLVMValueRef 1250 lp_build_int_to_float(struct lp_build_context *bld, 1251 LLVMValueRef a) 1252 { 1253 LLVMBuilderRef builder = bld->gallivm->builder; 1254 const struct lp_type type = bld->type; 1255 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1256 1257 assert(type.floating); 1258 1259 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1260 } 1261 1262 static boolean 1263 sse41_rounding_available(const struct lp_type type) 1264 { 1265 if ((util_cpu_caps.has_sse4_1 && 1266 (type.length == 1 || type.width*type.length == 128)) || 1267 (util_cpu_caps.has_avx && type.width*type.length == 256)) 1268 return TRUE; 1269 1270 return FALSE; 1271 } 1272 1273 enum lp_build_round_sse41_mode 1274 { 1275 LP_BUILD_ROUND_SSE41_NEAREST = 0, 1276 LP_BUILD_ROUND_SSE41_FLOOR = 1, 1277 LP_BUILD_ROUND_SSE41_CEIL = 2, 1278 LP_BUILD_ROUND_SSE41_TRUNCATE = 3 1279 }; 1280 1281 1282 /** 1283 * Helper for SSE4.1's ROUNDxx instructions. 1284 * 1285 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the 1286 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. 1287 */ 1288 static INLINE LLVMValueRef 1289 lp_build_round_sse41(struct lp_build_context *bld, 1290 LLVMValueRef a, 1291 enum lp_build_round_sse41_mode mode) 1292 { 1293 LLVMBuilderRef builder = bld->gallivm->builder; 1294 const struct lp_type type = bld->type; 1295 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1296 const char *intrinsic; 1297 LLVMValueRef res; 1298 1299 assert(type.floating); 1300 1301 assert(lp_check_value(type, a)); 1302 assert(util_cpu_caps.has_sse4_1); 1303 1304 if (type.length == 1) { 1305 LLVMTypeRef vec_type; 1306 LLVMValueRef undef; 1307 LLVMValueRef args[3]; 1308 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1309 1310 switch(type.width) { 1311 case 32: 1312 intrinsic = "llvm.x86.sse41.round.ss"; 1313 break; 1314 case 64: 1315 intrinsic = "llvm.x86.sse41.round.sd"; 1316 break; 1317 default: 1318 assert(0); 1319 return bld->undef; 1320 } 1321 1322 vec_type = LLVMVectorType(bld->elem_type, 4); 1323 1324 undef = LLVMGetUndef(vec_type); 1325 1326 args[0] = undef; 1327 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1328 args[2] = LLVMConstInt(i32t, mode, 0); 1329 1330 res = lp_build_intrinsic(builder, intrinsic, 1331 vec_type, args, Elements(args)); 1332 1333 res = LLVMBuildExtractElement(builder, res, index0, ""); 1334 } 1335 else { 1336 if (type.width * type.length == 128) { 1337 switch(type.width) { 1338 case 32: 1339 intrinsic = "llvm.x86.sse41.round.ps"; 1340 break; 1341 case 64: 1342 intrinsic = "llvm.x86.sse41.round.pd"; 1343 break; 1344 default: 1345 assert(0); 1346 return bld->undef; 1347 } 1348 } 1349 else { 1350 assert(type.width * type.length == 256); 1351 assert(util_cpu_caps.has_avx); 1352 1353 switch(type.width) { 1354 case 32: 1355 intrinsic = "llvm.x86.avx.round.ps.256"; 1356 break; 1357 case 64: 1358 intrinsic = "llvm.x86.avx.round.pd.256"; 1359 break; 1360 default: 1361 assert(0); 1362 return bld->undef; 1363 } 1364 } 1365 1366 res = lp_build_intrinsic_binary(builder, intrinsic, 1367 bld->vec_type, a, 1368 LLVMConstInt(i32t, mode, 0)); 1369 } 1370 1371 return res; 1372 } 1373 1374 1375 static INLINE LLVMValueRef 1376 lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1377 LLVMValueRef a) 1378 { 1379 LLVMBuilderRef builder = bld->gallivm->builder; 1380 const struct lp_type type = bld->type; 1381 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1382 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1383 const char *intrinsic; 1384 LLVMValueRef res; 1385 1386 assert(type.floating); 1387 /* using the double precision conversions is a bit more complicated */ 1388 assert(type.width == 32); 1389 1390 assert(lp_check_value(type, a)); 1391 assert(util_cpu_caps.has_sse2); 1392 1393 /* This is relying on MXCSR rounding mode, which should always be nearest. */ 1394 if (type.length == 1) { 1395 LLVMTypeRef vec_type; 1396 LLVMValueRef undef; 1397 LLVMValueRef arg; 1398 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1399 1400 vec_type = LLVMVectorType(bld->elem_type, 4); 1401 1402 intrinsic = "llvm.x86.sse.cvtss2si"; 1403 1404 undef = LLVMGetUndef(vec_type); 1405 1406 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1407 1408 res = lp_build_intrinsic_unary(builder, intrinsic, 1409 ret_type, arg); 1410 } 1411 else { 1412 if (type.width* type.length == 128) { 1413 intrinsic = "llvm.x86.sse2.cvtps2dq"; 1414 } 1415 else { 1416 assert(type.width*type.length == 256); 1417 assert(util_cpu_caps.has_avx); 1418 1419 intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 1420 } 1421 res = lp_build_intrinsic_unary(builder, intrinsic, 1422 ret_type, a); 1423 } 1424 1425 return res; 1426 } 1427 1428 1429 /** 1430 * Return the integer part of a float (vector) value (== round toward zero). 1431 * The returned value is a float (vector). 1432 * Ex: trunc(-1.5) = -1.0 1433 */ 1434 LLVMValueRef 1435 lp_build_trunc(struct lp_build_context *bld, 1436 LLVMValueRef a) 1437 { 1438 LLVMBuilderRef builder = bld->gallivm->builder; 1439 const struct lp_type type = bld->type; 1440 1441 assert(type.floating); 1442 assert(lp_check_value(type, a)); 1443 1444 if (sse41_rounding_available(type)) { 1445 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); 1446 } 1447 else { 1448 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1449 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1450 LLVMValueRef res; 1451 res = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1452 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1453 return res; 1454 } 1455 } 1456 1457 1458 /** 1459 * Return float (vector) rounded to nearest integer (vector). The returned 1460 * value is a float (vector). 1461 * Ex: round(0.9) = 1.0 1462 * Ex: round(-1.5) = -2.0 1463 */ 1464 LLVMValueRef 1465 lp_build_round(struct lp_build_context *bld, 1466 LLVMValueRef a) 1467 { 1468 LLVMBuilderRef builder = bld->gallivm->builder; 1469 const struct lp_type type = bld->type; 1470 1471 assert(type.floating); 1472 assert(lp_check_value(type, a)); 1473 1474 if (sse41_rounding_available(type)) { 1475 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1476 } 1477 else { 1478 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1479 LLVMValueRef res; 1480 res = lp_build_iround(bld, a); 1481 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1482 return res; 1483 } 1484 } 1485 1486 1487 /** 1488 * Return floor of float (vector), result is a float (vector) 1489 * Ex: floor(1.1) = 1.0 1490 * Ex: floor(-1.1) = -2.0 1491 */ 1492 LLVMValueRef 1493 lp_build_floor(struct lp_build_context *bld, 1494 LLVMValueRef a) 1495 { 1496 LLVMBuilderRef builder = bld->gallivm->builder; 1497 const struct lp_type type = bld->type; 1498 1499 assert(type.floating); 1500 assert(lp_check_value(type, a)); 1501 1502 if (sse41_rounding_available(type)) { 1503 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1504 } 1505 else { 1506 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1507 LLVMValueRef res; 1508 res = lp_build_ifloor(bld, a); 1509 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1510 return res; 1511 } 1512 } 1513 1514 1515 /** 1516 * Return ceiling of float (vector), returning float (vector). 1517 * Ex: ceil( 1.1) = 2.0 1518 * Ex: ceil(-1.1) = -1.0 1519 */ 1520 LLVMValueRef 1521 lp_build_ceil(struct lp_build_context *bld, 1522 LLVMValueRef a) 1523 { 1524 LLVMBuilderRef builder = bld->gallivm->builder; 1525 const struct lp_type type = bld->type; 1526 1527 assert(type.floating); 1528 assert(lp_check_value(type, a)); 1529 1530 if (sse41_rounding_available(type)) { 1531 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1532 } 1533 else { 1534 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1535 LLVMValueRef res; 1536 res = lp_build_iceil(bld, a); 1537 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 1538 return res; 1539 } 1540 } 1541 1542 1543 /** 1544 * Return fractional part of 'a' computed as a - floor(a) 1545 * Typically used in texture coord arithmetic. 1546 */ 1547 LLVMValueRef 1548 lp_build_fract(struct lp_build_context *bld, 1549 LLVMValueRef a) 1550 { 1551 assert(bld->type.floating); 1552 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 1553 } 1554 1555 1556 /** 1557 * Prevent returning a fractional part of 1.0 for very small negative values of 1558 * 'a' by clamping against 0.99999(9). 1559 */ 1560 static inline LLVMValueRef 1561 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 1562 { 1563 LLVMValueRef max; 1564 1565 /* this is the largest number smaller than 1.0 representable as float */ 1566 max = lp_build_const_vec(bld->gallivm, bld->type, 1567 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 1568 return lp_build_min(bld, fract, max); 1569 } 1570 1571 1572 /** 1573 * Same as lp_build_fract, but guarantees that the result is always smaller 1574 * than one. 1575 */ 1576 LLVMValueRef 1577 lp_build_fract_safe(struct lp_build_context *bld, 1578 LLVMValueRef a) 1579 { 1580 return clamp_fract(bld, lp_build_fract(bld, a)); 1581 } 1582 1583 1584 /** 1585 * Return the integer part of a float (vector) value (== round toward zero). 1586 * The returned value is an integer (vector). 1587 * Ex: itrunc(-1.5) = -1 1588 */ 1589 LLVMValueRef 1590 lp_build_itrunc(struct lp_build_context *bld, 1591 LLVMValueRef a) 1592 { 1593 LLVMBuilderRef builder = bld->gallivm->builder; 1594 const struct lp_type type = bld->type; 1595 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1596 1597 assert(type.floating); 1598 assert(lp_check_value(type, a)); 1599 1600 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 1601 } 1602 1603 1604 /** 1605 * Return float (vector) rounded to nearest integer (vector). The returned 1606 * value is an integer (vector). 1607 * Ex: iround(0.9) = 1 1608 * Ex: iround(-1.5) = -2 1609 */ 1610 LLVMValueRef 1611 lp_build_iround(struct lp_build_context *bld, 1612 LLVMValueRef a) 1613 { 1614 LLVMBuilderRef builder = bld->gallivm->builder; 1615 const struct lp_type type = bld->type; 1616 LLVMTypeRef int_vec_type = bld->int_vec_type; 1617 LLVMValueRef res; 1618 1619 assert(type.floating); 1620 1621 assert(lp_check_value(type, a)); 1622 1623 if ((util_cpu_caps.has_sse2 && 1624 ((type.width == 32) && (type.length == 1 || type.length == 4))) || 1625 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 1626 return lp_build_iround_nearest_sse2(bld, a); 1627 } 1628 if (sse41_rounding_available(type)) { 1629 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); 1630 } 1631 else { 1632 LLVMValueRef half; 1633 1634 half = lp_build_const_vec(bld->gallivm, type, 0.5); 1635 1636 if (type.sign) { 1637 LLVMTypeRef vec_type = bld->vec_type; 1638 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1639 (unsigned long long)1 << (type.width - 1)); 1640 LLVMValueRef sign; 1641 1642 /* get sign bit */ 1643 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1644 sign = LLVMBuildAnd(builder, sign, mask, ""); 1645 1646 /* sign * 0.5 */ 1647 half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 1648 half = LLVMBuildOr(builder, sign, half, ""); 1649 half = LLVMBuildBitCast(builder, half, vec_type, ""); 1650 } 1651 1652 res = LLVMBuildFAdd(builder, a, half, ""); 1653 } 1654 1655 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 1656 1657 return res; 1658 } 1659 1660 1661 /** 1662 * Return floor of float (vector), result is an int (vector) 1663 * Ex: ifloor(1.1) = 1.0 1664 * Ex: ifloor(-1.1) = -2.0 1665 */ 1666 LLVMValueRef 1667 lp_build_ifloor(struct lp_build_context *bld, 1668 LLVMValueRef a) 1669 { 1670 LLVMBuilderRef builder = bld->gallivm->builder; 1671 const struct lp_type type = bld->type; 1672 LLVMTypeRef int_vec_type = bld->int_vec_type; 1673 LLVMValueRef res; 1674 1675 assert(type.floating); 1676 assert(lp_check_value(type, a)); 1677 1678 res = a; 1679 if (type.sign) { 1680 if (sse41_rounding_available(type)) { 1681 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); 1682 } 1683 else { 1684 /* Take the sign bit and add it to 1 constant */ 1685 LLVMTypeRef vec_type = bld->vec_type; 1686 unsigned mantissa = lp_mantissa(type); 1687 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1688 (unsigned long long)1 << (type.width - 1)); 1689 LLVMValueRef sign; 1690 LLVMValueRef offset; 1691 1692 /* sign = a < 0 ? ~0 : 0 */ 1693 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1694 sign = LLVMBuildAnd(builder, sign, mask, ""); 1695 sign = LLVMBuildAShr(builder, sign, 1696 lp_build_const_int_vec(bld->gallivm, type, 1697 type.width - 1), 1698 "ifloor.sign"); 1699 1700 /* offset = -0.99999(9)f */ 1701 offset = lp_build_const_vec(bld->gallivm, type, 1702 -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1703 offset = LLVMConstBitCast(offset, int_vec_type); 1704 1705 /* offset = a < 0 ? offset : 0.0f */ 1706 offset = LLVMBuildAnd(builder, offset, sign, ""); 1707 offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset"); 1708 1709 res = LLVMBuildFAdd(builder, res, offset, "ifloor.res"); 1710 } 1711 } 1712 1713 /* round to nearest (toward zero) */ 1714 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 1715 1716 return res; 1717 } 1718 1719 1720 /** 1721 * Return ceiling of float (vector), returning int (vector). 1722 * Ex: iceil( 1.1) = 2 1723 * Ex: iceil(-1.1) = -1 1724 */ 1725 LLVMValueRef 1726 lp_build_iceil(struct lp_build_context *bld, 1727 LLVMValueRef a) 1728 { 1729 LLVMBuilderRef builder = bld->gallivm->builder; 1730 const struct lp_type type = bld->type; 1731 LLVMTypeRef int_vec_type = bld->int_vec_type; 1732 LLVMValueRef res; 1733 1734 assert(type.floating); 1735 assert(lp_check_value(type, a)); 1736 1737 if (sse41_rounding_available(type)) { 1738 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); 1739 } 1740 else { 1741 LLVMTypeRef vec_type = bld->vec_type; 1742 unsigned mantissa = lp_mantissa(type); 1743 LLVMValueRef offset; 1744 1745 /* offset = 0.99999(9)f */ 1746 offset = lp_build_const_vec(bld->gallivm, type, 1747 (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); 1748 1749 if (type.sign) { 1750 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1751 (unsigned long long)1 << (type.width - 1)); 1752 LLVMValueRef sign; 1753 1754 /* sign = a < 0 ? 0 : ~0 */ 1755 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1756 sign = LLVMBuildAnd(builder, sign, mask, ""); 1757 sign = LLVMBuildAShr(builder, sign, 1758 lp_build_const_int_vec(bld->gallivm, type, 1759 type.width - 1), 1760 "iceil.sign"); 1761 sign = LLVMBuildNot(builder, sign, "iceil.not"); 1762 1763 /* offset = a < 0 ? 0.0 : offset */ 1764 offset = LLVMConstBitCast(offset, int_vec_type); 1765 offset = LLVMBuildAnd(builder, offset, sign, ""); 1766 offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset"); 1767 } 1768 1769 res = LLVMBuildFAdd(builder, a, offset, "iceil.res"); 1770 } 1771 1772 /* round to nearest (toward zero) */ 1773 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 1774 1775 return res; 1776 } 1777 1778 1779 /** 1780 * Combined ifloor() & fract(). 1781 * 1782 * Preferred to calling the functions separately, as it will ensure that the 1783 * strategy (floor() vs ifloor()) that results in less redundant work is used. 1784 */ 1785 void 1786 lp_build_ifloor_fract(struct lp_build_context *bld, 1787 LLVMValueRef a, 1788 LLVMValueRef *out_ipart, 1789 LLVMValueRef *out_fpart) 1790 { 1791 LLVMBuilderRef builder = bld->gallivm->builder; 1792 const struct lp_type type = bld->type; 1793 LLVMValueRef ipart; 1794 1795 assert(type.floating); 1796 assert(lp_check_value(type, a)); 1797 1798 if (sse41_rounding_available(type)) { 1799 /* 1800 * floor() is easier. 1801 */ 1802 1803 ipart = lp_build_floor(bld, a); 1804 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1805 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 1806 } 1807 else { 1808 /* 1809 * ifloor() is easier. 1810 */ 1811 1812 *out_ipart = lp_build_ifloor(bld, a); 1813 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 1814 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 1815 } 1816 } 1817 1818 1819 /** 1820 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 1821 * always smaller than one. 1822 */ 1823 void 1824 lp_build_ifloor_fract_safe(struct lp_build_context *bld, 1825 LLVMValueRef a, 1826 LLVMValueRef *out_ipart, 1827 LLVMValueRef *out_fpart) 1828 { 1829 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 1830 *out_fpart = clamp_fract(bld, *out_fpart); 1831 } 1832 1833 1834 LLVMValueRef 1835 lp_build_sqrt(struct lp_build_context *bld, 1836 LLVMValueRef a) 1837 { 1838 LLVMBuilderRef builder = bld->gallivm->builder; 1839 const struct lp_type type = bld->type; 1840 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1841 char intrinsic[32]; 1842 1843 assert(lp_check_value(type, a)); 1844 1845 /* TODO: optimize the constant case */ 1846 1847 assert(type.floating); 1848 if (type.length == 1) { 1849 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width); 1850 } 1851 else { 1852 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); 1853 } 1854 1855 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1856 } 1857 1858 1859 /** 1860 * Do one Newton-Raphson step to improve reciprocate precision: 1861 * 1862 * x_{i+1} = x_i * (2 - a * x_i) 1863 * 1864 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 1865 * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 1866 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's 1867 * halo. It would be necessary to clamp the argument to prevent this. 1868 * 1869 * See also: 1870 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 1871 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1872 */ 1873 static INLINE LLVMValueRef 1874 lp_build_rcp_refine(struct lp_build_context *bld, 1875 LLVMValueRef a, 1876 LLVMValueRef rcp_a) 1877 { 1878 LLVMBuilderRef builder = bld->gallivm->builder; 1879 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0); 1880 LLVMValueRef res; 1881 1882 res = LLVMBuildFMul(builder, a, rcp_a, ""); 1883 res = LLVMBuildFSub(builder, two, res, ""); 1884 res = LLVMBuildFMul(builder, rcp_a, res, ""); 1885 1886 return res; 1887 } 1888 1889 1890 LLVMValueRef 1891 lp_build_rcp(struct lp_build_context *bld, 1892 LLVMValueRef a) 1893 { 1894 LLVMBuilderRef builder = bld->gallivm->builder; 1895 const struct lp_type type = bld->type; 1896 1897 assert(lp_check_value(type, a)); 1898 1899 if(a == bld->zero) 1900 return bld->undef; 1901 if(a == bld->one) 1902 return bld->one; 1903 if(a == bld->undef) 1904 return bld->undef; 1905 1906 assert(type.floating); 1907 1908 if(LLVMIsConstant(a)) 1909 return LLVMConstFDiv(bld->one, a); 1910 1911 /* 1912 * We don't use RCPPS because: 1913 * - it only has 10bits of precision 1914 * - it doesn't even get the reciprocate of 1.0 exactly 1915 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 1916 * - for recent processors the benefit over DIVPS is marginal, a case 1917 * dependent 1918 * 1919 * We could still use it on certain processors if benchmarks show that the 1920 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 1921 * particular uses that require less workarounds. 1922 */ 1923 1924 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1925 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ 1926 const unsigned num_iterations = 0; 1927 LLVMValueRef res; 1928 unsigned i; 1929 const char *intrinsic = NULL; 1930 1931 if (type.length == 4) { 1932 intrinsic = "llvm.x86.sse.rcp.ps"; 1933 } 1934 else { 1935 intrinsic = "llvm.x86.avx.rcp.ps.256"; 1936 } 1937 1938 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 1939 1940 for (i = 0; i < num_iterations; ++i) { 1941 res = lp_build_rcp_refine(bld, a, res); 1942 } 1943 1944 return res; 1945 } 1946 1947 return LLVMBuildFDiv(builder, bld->one, a, ""); 1948 } 1949 1950 1951 /** 1952 * Do one Newton-Raphson step to improve rsqrt precision: 1953 * 1954 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 1955 * 1956 * See also: 1957 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 1958 */ 1959 static INLINE LLVMValueRef 1960 lp_build_rsqrt_refine(struct lp_build_context *bld, 1961 LLVMValueRef a, 1962 LLVMValueRef rsqrt_a) 1963 { 1964 LLVMBuilderRef builder = bld->gallivm->builder; 1965 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 1966 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 1967 LLVMValueRef res; 1968 1969 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 1970 res = LLVMBuildFMul(builder, a, res, ""); 1971 res = LLVMBuildFSub(builder, three, res, ""); 1972 res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 1973 res = LLVMBuildFMul(builder, half, res, ""); 1974 1975 return res; 1976 } 1977 1978 1979 /** 1980 * Generate 1/sqrt(a) 1981 */ 1982 LLVMValueRef 1983 lp_build_rsqrt(struct lp_build_context *bld, 1984 LLVMValueRef a) 1985 { 1986 LLVMBuilderRef builder = bld->gallivm->builder; 1987 const struct lp_type type = bld->type; 1988 1989 assert(lp_check_value(type, a)); 1990 1991 assert(type.floating); 1992 1993 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1994 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 1995 const unsigned num_iterations = 1; 1996 LLVMValueRef res; 1997 unsigned i; 1998 const char *intrinsic = NULL; 1999 2000 if (type.length == 4) { 2001 intrinsic = "llvm.x86.sse.rsqrt.ps"; 2002 } 2003 else { 2004 intrinsic = "llvm.x86.avx.rsqrt.ps.256"; 2005 } 2006 2007 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2008 2009 2010 for (i = 0; i < num_iterations; ++i) { 2011 res = lp_build_rsqrt_refine(bld, a, res); 2012 } 2013 2014 return res; 2015 } 2016 2017 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2018 } 2019 2020 2021 /** 2022 * Generate sin(a) using SSE2 2023 */ 2024 LLVMValueRef 2025 lp_build_sin(struct lp_build_context *bld, 2026 LLVMValueRef a) 2027 { 2028 struct gallivm_state *gallivm = bld->gallivm; 2029 LLVMBuilderRef builder = gallivm->builder; 2030 struct lp_type int_type = lp_int_type(bld->type); 2031 LLVMBuilderRef b = builder; 2032 2033 /* 2034 * take the absolute value, 2035 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2036 */ 2037 2038 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2039 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2040 2041 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2042 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2043 2044 /* 2045 * extract the sign bit (upper one) 2046 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); 2047 */ 2048 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2049 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); 2050 2051 /* 2052 * scale by 4/Pi 2053 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2054 */ 2055 2056 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2057 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2058 2059 /* 2060 * store the integer part of y in mm0 2061 * emm2 = _mm_cvttps_epi32(y); 2062 */ 2063 2064 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2065 2066 /* 2067 * j=(j+1) & (~1) (see the cephes sources) 2068 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2069 */ 2070 2071 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2072 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2073 /* 2074 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2075 */ 2076 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2077 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2078 2079 /* 2080 * y = _mm_cvtepi32_ps(emm2); 2081 */ 2082 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2083 2084 /* get the swap sign flag 2085 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); 2086 */ 2087 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2088 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); 2089 2090 /* 2091 * emm2 = _mm_slli_epi32(emm0, 29); 2092 */ 2093 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2094 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); 2095 2096 /* 2097 * get the polynom selection mask 2098 * there is one polynom for 0 <= x <= Pi/4 2099 * and another one for Pi/4<x<=Pi/2 2100 * Both branches will be computed. 2101 * 2102 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2103 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2104 */ 2105 2106 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2107 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); 2108 LLVMValueRef poly_mask = lp_build_compare(gallivm, 2109 int_type, PIPE_FUNC_EQUAL, 2110 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2111 /* 2112 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); 2113 */ 2114 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); 2115 2116 /* 2117 * _PS_CONST(minus_cephes_DP1, -0.78515625); 2118 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2119 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2120 */ 2121 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2122 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2123 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2124 2125 /* 2126 * The magic pass: "Extended precision modular arithmetic" 2127 * x = ((x - y * DP1) - y * DP2) - y * DP3; 2128 * xmm1 = _mm_mul_ps(y, xmm1); 2129 * xmm2 = _mm_mul_ps(y, xmm2); 2130 * xmm3 = _mm_mul_ps(y, xmm3); 2131 */ 2132 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 2133 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 2134 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 2135 2136 /* 2137 * x = _mm_add_ps(x, xmm1); 2138 * x = _mm_add_ps(x, xmm2); 2139 * x = _mm_add_ps(x, xmm3); 2140 */ 2141 2142 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 2143 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 2144 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 2145 2146 /* 2147 * Evaluate the first polynom (0 <= x <= Pi/4) 2148 * 2149 * z = _mm_mul_ps(x,x); 2150 */ 2151 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2152 2153 /* 2154 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2155 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2156 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2157 */ 2158 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 2159 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2160 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2161 2162 /* 2163 * y = *(v4sf*)_ps_coscof_p0; 2164 * y = _mm_mul_ps(y, z); 2165 */ 2166 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 2167 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 2168 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 2169 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 2170 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2171 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2172 2173 2174 /* 2175 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2176 * y = _mm_sub_ps(y, tmp); 2177 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2178 */ 2179 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2180 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2181 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2182 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2183 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2184 2185 /* 2186 * _PS_CONST(sincof_p0, -1.9515295891E-4); 2187 * _PS_CONST(sincof_p1, 8.3321608736E-3); 2188 * _PS_CONST(sincof_p2, -1.6666654611E-1); 2189 */ 2190 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2191 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2192 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2193 2194 /* 2195 * Evaluate the second polynom (Pi/4 <= x <= 0) 2196 * 2197 * y2 = *(v4sf*)_ps_sincof_p0; 2198 * y2 = _mm_mul_ps(y2, z); 2199 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2200 * y2 = _mm_mul_ps(y2, z); 2201 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2202 * y2 = _mm_mul_ps(y2, z); 2203 * y2 = _mm_mul_ps(y2, x); 2204 * y2 = _mm_add_ps(y2, x); 2205 */ 2206 2207 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 2208 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 2209 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2210 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2211 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2212 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2213 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2214 2215 /* 2216 * select the correct result from the two polynoms 2217 * xmm3 = poly_mask; 2218 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2219 * y = _mm_andnot_ps(xmm3, y); 2220 * y = _mm_add_ps(y,y2); 2221 */ 2222 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2223 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2224 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2225 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 2226 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2227 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2228 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2229 2230 /* 2231 * update the sign 2232 * y = _mm_xor_ps(y, sign_bit); 2233 */ 2234 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); 2235 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2236 return y_result; 2237 } 2238 2239 2240 /** 2241 * Generate cos(a) using SSE2 2242 */ 2243 LLVMValueRef 2244 lp_build_cos(struct lp_build_context *bld, 2245 LLVMValueRef a) 2246 { 2247 struct gallivm_state *gallivm = bld->gallivm; 2248 LLVMBuilderRef builder = gallivm->builder; 2249 struct lp_type int_type = lp_int_type(bld->type); 2250 LLVMBuilderRef b = builder; 2251 2252 /* 2253 * take the absolute value, 2254 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2255 */ 2256 2257 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2258 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2259 2260 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2261 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2262 2263 /* 2264 * scale by 4/Pi 2265 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2266 */ 2267 2268 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2269 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2270 2271 /* 2272 * store the integer part of y in mm0 2273 * emm2 = _mm_cvttps_epi32(y); 2274 */ 2275 2276 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2277 2278 /* 2279 * j=(j+1) & (~1) (see the cephes sources) 2280 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2281 */ 2282 2283 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2284 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2285 /* 2286 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2287 */ 2288 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2289 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2290 2291 /* 2292 * y = _mm_cvtepi32_ps(emm2); 2293 */ 2294 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2295 2296 2297 /* 2298 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); 2299 */ 2300 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2301 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); 2302 2303 2304 /* get the swap sign flag 2305 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); 2306 */ 2307 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0); 2308 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not"); 2309 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2310 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); 2311 2312 /* 2313 * emm2 = _mm_slli_epi32(emm0, 29); 2314 */ 2315 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2316 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); 2317 2318 /* 2319 * get the polynom selection mask 2320 * there is one polynom for 0 <= x <= Pi/4 2321 * and another one for Pi/4<x<=Pi/2 2322 * Both branches will be computed. 2323 * 2324 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2325 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2326 */ 2327 2328 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2329 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); 2330 LLVMValueRef poly_mask = lp_build_compare(gallivm, 2331 int_type, PIPE_FUNC_EQUAL, 2332 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2333 2334 /* 2335 * _PS_CONST(minus_cephes_DP1, -0.78515625); 2336 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2337 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2338 */ 2339 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2340 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2341 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2342 2343 /* 2344 * The magic pass: "Extended precision modular arithmetic" 2345 * x = ((x - y * DP1) - y * DP2) - y * DP3; 2346 * xmm1 = _mm_mul_ps(y, xmm1); 2347 * xmm2 = _mm_mul_ps(y, xmm2); 2348 * xmm3 = _mm_mul_ps(y, xmm3); 2349 */ 2350 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); 2351 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); 2352 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); 2353 2354 /* 2355 * x = _mm_add_ps(x, xmm1); 2356 * x = _mm_add_ps(x, xmm2); 2357 * x = _mm_add_ps(x, xmm3); 2358 */ 2359 2360 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); 2361 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); 2362 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); 2363 2364 /* 2365 * Evaluate the first polynom (0 <= x <= Pi/4) 2366 * 2367 * z = _mm_mul_ps(x,x); 2368 */ 2369 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2370 2371 /* 2372 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2373 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2374 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2375 */ 2376 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 2377 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2378 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2379 2380 /* 2381 * y = *(v4sf*)_ps_coscof_p0; 2382 * y = _mm_mul_ps(y, z); 2383 */ 2384 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); 2385 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); 2386 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); 2387 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); 2388 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2389 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2390 2391 2392 /* 2393 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2394 * y = _mm_sub_ps(y, tmp); 2395 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2396 */ 2397 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2398 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2399 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2400 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2401 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2402 2403 /* 2404 * _PS_CONST(sincof_p0, -1.9515295891E-4); 2405 * _PS_CONST(sincof_p1, 8.3321608736E-3); 2406 * _PS_CONST(sincof_p2, -1.6666654611E-1); 2407 */ 2408 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2409 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2410 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2411 2412 /* 2413 * Evaluate the second polynom (Pi/4 <= x <= 0) 2414 * 2415 * y2 = *(v4sf*)_ps_sincof_p0; 2416 * y2 = _mm_mul_ps(y2, z); 2417 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2418 * y2 = _mm_mul_ps(y2, z); 2419 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2420 * y2 = _mm_mul_ps(y2, z); 2421 * y2 = _mm_mul_ps(y2, x); 2422 * y2 = _mm_add_ps(y2, x); 2423 */ 2424 2425 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); 2426 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); 2427 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); 2428 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); 2429 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2430 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); 2431 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); 2432 2433 /* 2434 * select the correct result from the two polynoms 2435 * xmm3 = poly_mask; 2436 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 2437 * y = _mm_andnot_ps(xmm3, y); 2438 * y = _mm_add_ps(y,y2); 2439 */ 2440 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 2441 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 2442 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 2443 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); 2444 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 2445 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); 2446 2447 /* 2448 * update the sign 2449 * y = _mm_xor_ps(y, sign_bit); 2450 */ 2451 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); 2452 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 2453 return y_result; 2454 } 2455 2456 2457 /** 2458 * Generate pow(x, y) 2459 */ 2460 LLVMValueRef 2461 lp_build_pow(struct lp_build_context *bld, 2462 LLVMValueRef x, 2463 LLVMValueRef y) 2464 { 2465 /* TODO: optimize the constant case */ 2466 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2467 LLVMIsConstant(x) && LLVMIsConstant(y)) { 2468 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2469 __FUNCTION__); 2470 } 2471 2472 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 2473 } 2474 2475 2476 /** 2477 * Generate exp(x) 2478 */ 2479 LLVMValueRef 2480 lp_build_exp(struct lp_build_context *bld, 2481 LLVMValueRef x) 2482 { 2483 /* log2(e) = 1/log(2) */ 2484 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 2485 1.4426950408889634); 2486 2487 assert(lp_check_value(bld->type, x)); 2488 2489 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 2490 } 2491 2492 2493 /** 2494 * Generate log(x) 2495 */ 2496 LLVMValueRef 2497 lp_build_log(struct lp_build_context *bld, 2498 LLVMValueRef x) 2499 { 2500 /* log(2) */ 2501 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 2502 0.69314718055994529); 2503 2504 assert(lp_check_value(bld->type, x)); 2505 2506 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 2507 } 2508 2509 2510 /** 2511 * Generate polynomial. 2512 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 2513 */ 2514 static LLVMValueRef 2515 lp_build_polynomial(struct lp_build_context *bld, 2516 LLVMValueRef x, 2517 const double *coeffs, 2518 unsigned num_coeffs) 2519 { 2520 const struct lp_type type = bld->type; 2521 LLVMValueRef even = NULL, odd = NULL; 2522 LLVMValueRef x2; 2523 unsigned i; 2524 2525 assert(lp_check_value(bld->type, x)); 2526 2527 /* TODO: optimize the constant case */ 2528 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2529 LLVMIsConstant(x)) { 2530 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2531 __FUNCTION__); 2532 } 2533 2534 /* 2535 * Calculate odd and even terms seperately to decrease data dependency 2536 * Ex: 2537 * c[0] + x^2 * c[2] + x^4 * c[4] ... 2538 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 2539 */ 2540 x2 = lp_build_mul(bld, x, x); 2541 2542 for (i = num_coeffs; i--; ) { 2543 LLVMValueRef coeff; 2544 2545 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 2546 2547 if (i % 2 == 0) { 2548 if (even) 2549 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even)); 2550 else 2551 even = coeff; 2552 } else { 2553 if (odd) 2554 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd)); 2555 else 2556 odd = coeff; 2557 } 2558 } 2559 2560 if (odd) 2561 return lp_build_add(bld, lp_build_mul(bld, odd, x), even); 2562 else if (even) 2563 return even; 2564 else 2565 return bld->undef; 2566 } 2567 2568 2569 /** 2570 * Minimax polynomial fit of 2**x, in range [0, 1[ 2571 */ 2572 const double lp_build_exp2_polynomial[] = { 2573 #if EXP_POLY_DEGREE == 5 2574 0.999999925063526176901, 2575 0.693153073200168932794, 2576 0.240153617044375388211, 2577 0.0558263180532956664775, 2578 0.00898934009049466391101, 2579 0.00187757667519147912699 2580 #elif EXP_POLY_DEGREE == 4 2581 1.00000259337069434683, 2582 0.693003834469974940458, 2583 0.24144275689150793076, 2584 0.0520114606103070150235, 2585 0.0135341679161270268764 2586 #elif EXP_POLY_DEGREE == 3 2587 0.999925218562710312959, 2588 0.695833540494823811697, 2589 0.226067155427249155588, 2590 0.0780245226406372992967 2591 #elif EXP_POLY_DEGREE == 2 2592 1.00172476321474503578, 2593 0.657636275736077639316, 2594 0.33718943461968720704 2595 #else 2596 #error 2597 #endif 2598 }; 2599 2600 2601 void 2602 lp_build_exp2_approx(struct lp_build_context *bld, 2603 LLVMValueRef x, 2604 LLVMValueRef *p_exp2_int_part, 2605 LLVMValueRef *p_frac_part, 2606 LLVMValueRef *p_exp2) 2607 { 2608 LLVMBuilderRef builder = bld->gallivm->builder; 2609 const struct lp_type type = bld->type; 2610 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2611 LLVMValueRef ipart = NULL; 2612 LLVMValueRef fpart = NULL; 2613 LLVMValueRef expipart = NULL; 2614 LLVMValueRef expfpart = NULL; 2615 LLVMValueRef res = NULL; 2616 2617 assert(lp_check_value(bld->type, x)); 2618 2619 if(p_exp2_int_part || p_frac_part || p_exp2) { 2620 /* TODO: optimize the constant case */ 2621 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2622 LLVMIsConstant(x)) { 2623 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2624 __FUNCTION__); 2625 } 2626 2627 assert(type.floating && type.width == 32); 2628 2629 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0)); 2630 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999)); 2631 2632 /* ipart = floor(x) */ 2633 /* fpart = x - ipart */ 2634 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 2635 } 2636 2637 if(p_exp2_int_part || p_exp2) { 2638 /* expipart = (float) (1 << ipart) */ 2639 expipart = LLVMBuildAdd(builder, ipart, 2640 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2641 expipart = LLVMBuildShl(builder, expipart, 2642 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2643 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 2644 } 2645 2646 if(p_exp2) { 2647 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 2648 Elements(lp_build_exp2_polynomial)); 2649 2650 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 2651 } 2652 2653 if(p_exp2_int_part) 2654 *p_exp2_int_part = expipart; 2655 2656 if(p_frac_part) 2657 *p_frac_part = fpart; 2658 2659 if(p_exp2) 2660 *p_exp2 = res; 2661 } 2662 2663 2664 LLVMValueRef 2665 lp_build_exp2(struct lp_build_context *bld, 2666 LLVMValueRef x) 2667 { 2668 LLVMValueRef res; 2669 lp_build_exp2_approx(bld, x, NULL, NULL, &res); 2670 return res; 2671 } 2672 2673 2674 /** 2675 * Extract the exponent of a IEEE-754 floating point value. 2676 * 2677 * Optionally apply an integer bias. 2678 * 2679 * Result is an integer value with 2680 * 2681 * ifloor(log2(x)) + bias 2682 */ 2683 LLVMValueRef 2684 lp_build_extract_exponent(struct lp_build_context *bld, 2685 LLVMValueRef x, 2686 int bias) 2687 { 2688 LLVMBuilderRef builder = bld->gallivm->builder; 2689 const struct lp_type type = bld->type; 2690 unsigned mantissa = lp_mantissa(type); 2691 LLVMValueRef res; 2692 2693 assert(type.floating); 2694 2695 assert(lp_check_value(bld->type, x)); 2696 2697 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2698 2699 res = LLVMBuildLShr(builder, x, 2700 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 2701 res = LLVMBuildAnd(builder, res, 2702 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 2703 res = LLVMBuildSub(builder, res, 2704 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 2705 2706 return res; 2707 } 2708 2709 2710 /** 2711 * Extract the mantissa of the a floating. 2712 * 2713 * Result is a floating point value with 2714 * 2715 * x / floor(log2(x)) 2716 */ 2717 LLVMValueRef 2718 lp_build_extract_mantissa(struct lp_build_context *bld, 2719 LLVMValueRef x) 2720 { 2721 LLVMBuilderRef builder = bld->gallivm->builder; 2722 const struct lp_type type = bld->type; 2723 unsigned mantissa = lp_mantissa(type); 2724 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 2725 (1ULL << mantissa) - 1); 2726 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 2727 LLVMValueRef res; 2728 2729 assert(lp_check_value(bld->type, x)); 2730 2731 assert(type.floating); 2732 2733 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 2734 2735 /* res = x / 2**ipart */ 2736 res = LLVMBuildAnd(builder, x, mantmask, ""); 2737 res = LLVMBuildOr(builder, res, one, ""); 2738 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 2739 2740 return res; 2741 } 2742 2743 2744 2745 /** 2746 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 2747 * These coefficients can be generate with 2748 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 2749 */ 2750 const double lp_build_log2_polynomial[] = { 2751 #if LOG_POLY_DEGREE == 5 2752 2.88539008148777786488L, 2753 0.961796878841293367824L, 2754 0.577058946784739859012L, 2755 0.412914355135828735411L, 2756 0.308591899232910175289L, 2757 0.352376952300281371868L, 2758 #elif LOG_POLY_DEGREE == 4 2759 2.88539009343309178325L, 2760 0.961791550404184197881L, 2761 0.577440339438736392009L, 2762 0.403343858251329912514L, 2763 0.406718052498846252698L, 2764 #elif LOG_POLY_DEGREE == 3 2765 2.88538959748872753838L, 2766 0.961932915889597772928L, 2767 0.571118517972136195241L, 2768 0.493997535084709500285L, 2769 #else 2770 #error 2771 #endif 2772 }; 2773 2774 /** 2775 * See http://www.devmaster.net/forums/showthread.php?p=43580 2776 * http://en.wikipedia.org/wiki/Logarithm#Calculation 2777 * http://www.nezumi.demon.co.uk/consult/logx.htm 2778 */ 2779 void 2780 lp_build_log2_approx(struct lp_build_context *bld, 2781 LLVMValueRef x, 2782 LLVMValueRef *p_exp, 2783 LLVMValueRef *p_floor_log2, 2784 LLVMValueRef *p_log2) 2785 { 2786 LLVMBuilderRef builder = bld->gallivm->builder; 2787 const struct lp_type type = bld->type; 2788 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2789 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2790 2791 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 2792 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 2793 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 2794 2795 LLVMValueRef i = NULL; 2796 LLVMValueRef y = NULL; 2797 LLVMValueRef z = NULL; 2798 LLVMValueRef exp = NULL; 2799 LLVMValueRef mant = NULL; 2800 LLVMValueRef logexp = NULL; 2801 LLVMValueRef logmant = NULL; 2802 LLVMValueRef res = NULL; 2803 2804 assert(lp_check_value(bld->type, x)); 2805 2806 if(p_exp || p_floor_log2 || p_log2) { 2807 /* TODO: optimize the constant case */ 2808 if (gallivm_debug & GALLIVM_DEBUG_PERF && 2809 LLVMIsConstant(x)) { 2810 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 2811 __FUNCTION__); 2812 } 2813 2814 assert(type.floating && type.width == 32); 2815 2816 /* 2817 * We don't explicitly handle denormalized numbers. They will yield a 2818 * result in the neighbourhood of -127, which appears to be adequate 2819 * enough. 2820 */ 2821 2822 i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 2823 2824 /* exp = (float) exponent(x) */ 2825 exp = LLVMBuildAnd(builder, i, expmask, ""); 2826 } 2827 2828 if(p_floor_log2 || p_log2) { 2829 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 2830 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 2831 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 2832 } 2833 2834 if(p_log2) { 2835 /* mant = 1 + (float) mantissa(x) */ 2836 mant = LLVMBuildAnd(builder, i, mantmask, ""); 2837 mant = LLVMBuildOr(builder, mant, one, ""); 2838 mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 2839 2840 /* y = (mant - 1) / (mant + 1) */ 2841 y = lp_build_div(bld, 2842 lp_build_sub(bld, mant, bld->one), 2843 lp_build_add(bld, mant, bld->one) 2844 ); 2845 2846 /* z = y^2 */ 2847 z = lp_build_mul(bld, y, y); 2848 2849 /* compute P(z) */ 2850 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 2851 Elements(lp_build_log2_polynomial)); 2852 2853 /* logmant = y * P(z) */ 2854 logmant = lp_build_mul(bld, y, logmant); 2855 2856 res = lp_build_add(bld, logmant, logexp); 2857 } 2858 2859 if(p_exp) { 2860 exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 2861 *p_exp = exp; 2862 } 2863 2864 if(p_floor_log2) 2865 *p_floor_log2 = logexp; 2866 2867 if(p_log2) 2868 *p_log2 = res; 2869 } 2870 2871 2872 LLVMValueRef 2873 lp_build_log2(struct lp_build_context *bld, 2874 LLVMValueRef x) 2875 { 2876 LLVMValueRef res; 2877 lp_build_log2_approx(bld, x, NULL, NULL, &res); 2878 return res; 2879 } 2880 2881 2882 /** 2883 * Faster (and less accurate) log2. 2884 * 2885 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 2886 * 2887 * Piece-wise linear approximation, with exact results when x is a 2888 * power of two. 2889 * 2890 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 2891 */ 2892 LLVMValueRef 2893 lp_build_fast_log2(struct lp_build_context *bld, 2894 LLVMValueRef x) 2895 { 2896 LLVMBuilderRef builder = bld->gallivm->builder; 2897 LLVMValueRef ipart; 2898 LLVMValueRef fpart; 2899 2900 assert(lp_check_value(bld->type, x)); 2901 2902 assert(bld->type.floating); 2903 2904 /* ipart = floor(log2(x)) - 1 */ 2905 ipart = lp_build_extract_exponent(bld, x, -1); 2906 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 2907 2908 /* fpart = x / 2**ipart */ 2909 fpart = lp_build_extract_mantissa(bld, x); 2910 2911 /* ipart + fpart */ 2912 return LLVMBuildFAdd(builder, ipart, fpart, ""); 2913 } 2914 2915 2916 /** 2917 * Fast implementation of iround(log2(x)). 2918 * 2919 * Not an approximation -- it should give accurate results all the time. 2920 */ 2921 LLVMValueRef 2922 lp_build_ilog2(struct lp_build_context *bld, 2923 LLVMValueRef x) 2924 { 2925 LLVMBuilderRef builder = bld->gallivm->builder; 2926 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 2927 LLVMValueRef ipart; 2928 2929 assert(bld->type.floating); 2930 2931 assert(lp_check_value(bld->type, x)); 2932 2933 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 2934 x = LLVMBuildFMul(builder, x, sqrt2, ""); 2935 2936 /* ipart = floor(log2(x) + 0.5) */ 2937 ipart = lp_build_extract_exponent(bld, x, 0); 2938 2939 return ipart; 2940 } 2941 2942 LLVMValueRef 2943 lp_build_mod(struct lp_build_context *bld, 2944 LLVMValueRef x, 2945 LLVMValueRef y) 2946 { 2947 LLVMBuilderRef builder = bld->gallivm->builder; 2948 LLVMValueRef res; 2949 const struct lp_type type = bld->type; 2950 2951 assert(lp_check_value(type, x)); 2952 assert(lp_check_value(type, y)); 2953 2954 if (type.floating) 2955 res = LLVMBuildFRem(builder, x, y, ""); 2956 else if (type.sign) 2957 res = LLVMBuildSRem(builder, x, y, ""); 2958 else 2959 res = LLVMBuildURem(builder, x, y, ""); 2960 return res; 2961 } 2962