Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2009-2010 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 
     29 /**
     30  * @file
     31  * Helper
     32  *
     33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
     34  * notably min/max and saturated operations), and it is often necessary to
     35  * resort machine-specific intrinsics directly. The functions here hide all
     36  * these implementation details from the other modules.
     37  *
     38  * We also do simple expressions simplification here. Reasons are:
     39  * - it is very easy given we have all necessary information readily available
     40  * - LLVM optimization passes fail to simplify several vector expressions
     41  * - We often know value constraints which the optimization passes have no way
     42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
     43  *
     44  * @author Jose Fonseca <jfonseca (at) vmware.com>
     45  */
     46 
     47 
     48 #include <float.h>
     49 
     50 #include "util/u_memory.h"
     51 #include "util/u_debug.h"
     52 #include "util/u_math.h"
     53 #include "util/u_cpu_detect.h"
     54 
     55 #include "lp_bld_type.h"
     56 #include "lp_bld_const.h"
     57 #include "lp_bld_init.h"
     58 #include "lp_bld_intr.h"
     59 #include "lp_bld_logic.h"
     60 #include "lp_bld_pack.h"
     61 #include "lp_bld_debug.h"
     62 #include "lp_bld_bitarit.h"
     63 #include "lp_bld_arit.h"
     64 #include "lp_bld_flow.h"
     65 
     66 #if defined(PIPE_ARCH_SSE)
     67 #include <xmmintrin.h>
     68 #endif
     69 
     70 #ifndef _MM_DENORMALS_ZERO_MASK
     71 #define _MM_DENORMALS_ZERO_MASK 0x0040
     72 #endif
     73 
     74 #ifndef _MM_FLUSH_ZERO_MASK
     75 #define _MM_FLUSH_ZERO_MASK 0x8000
     76 #endif
     77 
     78 #define EXP_POLY_DEGREE 5
     79 
     80 #define LOG_POLY_DEGREE 4
     81 
     82 
     83 /**
     84  * Generate min(a, b)
     85  * No checks for special case values of a or b = 1 or 0 are done.
     86  * NaN's are handled according to the behavior specified by the
     87  * nan_behavior argument.
     88  */
     89 static LLVMValueRef
     90 lp_build_min_simple(struct lp_build_context *bld,
     91                     LLVMValueRef a,
     92                     LLVMValueRef b,
     93                     enum gallivm_nan_behavior nan_behavior)
     94 {
     95    const struct lp_type type = bld->type;
     96    const char *intrinsic = NULL;
     97    unsigned intr_size = 0;
     98    LLVMValueRef cond;
     99 
    100    assert(lp_check_value(type, a));
    101    assert(lp_check_value(type, b));
    102 
    103    /* TODO: optimize the constant case */
    104 
    105    if (type.floating && util_cpu_caps.has_sse) {
    106       if (type.width == 32) {
    107          if (type.length == 1) {
    108             intrinsic = "llvm.x86.sse.min.ss";
    109             intr_size = 128;
    110          }
    111          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
    112             intrinsic = "llvm.x86.sse.min.ps";
    113             intr_size = 128;
    114          }
    115          else {
    116             intrinsic = "llvm.x86.avx.min.ps.256";
    117             intr_size = 256;
    118          }
    119       }
    120       if (type.width == 64 && util_cpu_caps.has_sse2) {
    121          if (type.length == 1) {
    122             intrinsic = "llvm.x86.sse2.min.sd";
    123             intr_size = 128;
    124          }
    125          else if (type.length == 2 || !util_cpu_caps.has_avx) {
    126             intrinsic = "llvm.x86.sse2.min.pd";
    127             intr_size = 128;
    128          }
    129          else {
    130             intrinsic = "llvm.x86.avx.min.pd.256";
    131             intr_size = 256;
    132          }
    133       }
    134    }
    135    else if (type.floating && util_cpu_caps.has_altivec) {
    136       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
    137           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
    138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
    139                       __FUNCTION__);
    140       }
    141       if (type.width == 32 && type.length == 4) {
    142          intrinsic = "llvm.ppc.altivec.vminfp";
    143          intr_size = 128;
    144       }
    145    } else if (HAVE_LLVM < 0x0309 &&
    146               util_cpu_caps.has_avx2 && type.length > 4) {
    147       intr_size = 256;
    148       switch (type.width) {
    149       case 8:
    150          intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
    151          break;
    152       case 16:
    153          intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
    154          break;
    155       case 32:
    156          intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
    157          break;
    158       }
    159    } else if (HAVE_LLVM < 0x0309 &&
    160               util_cpu_caps.has_sse2 && type.length >= 2) {
    161       intr_size = 128;
    162       if ((type.width == 8 || type.width == 16) &&
    163           (type.width * type.length <= 64) &&
    164           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
    165          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
    166                       __FUNCTION__);
    167       }
    168       if (type.width == 8 && !type.sign) {
    169          intrinsic = "llvm.x86.sse2.pminu.b";
    170       }
    171       else if (type.width == 16 && type.sign) {
    172          intrinsic = "llvm.x86.sse2.pmins.w";
    173       }
    174       if (util_cpu_caps.has_sse4_1) {
    175          if (type.width == 8 && type.sign) {
    176             intrinsic = "llvm.x86.sse41.pminsb";
    177          }
    178          if (type.width == 16 && !type.sign) {
    179             intrinsic = "llvm.x86.sse41.pminuw";
    180          }
    181          if (type.width == 32 && !type.sign) {
    182             intrinsic = "llvm.x86.sse41.pminud";
    183          }
    184          if (type.width == 32 && type.sign) {
    185             intrinsic = "llvm.x86.sse41.pminsd";
    186          }
    187       }
    188    } else if (util_cpu_caps.has_altivec) {
    189       intr_size = 128;
    190       if (type.width == 8) {
    191          if (!type.sign) {
    192             intrinsic = "llvm.ppc.altivec.vminub";
    193          } else {
    194             intrinsic = "llvm.ppc.altivec.vminsb";
    195          }
    196       } else if (type.width == 16) {
    197          if (!type.sign) {
    198             intrinsic = "llvm.ppc.altivec.vminuh";
    199          } else {
    200             intrinsic = "llvm.ppc.altivec.vminsh";
    201          }
    202       } else if (type.width == 32) {
    203          if (!type.sign) {
    204             intrinsic = "llvm.ppc.altivec.vminuw";
    205          } else {
    206             intrinsic = "llvm.ppc.altivec.vminsw";
    207          }
    208       }
    209    }
    210 
    211    if (intrinsic) {
    212       /* We need to handle nan's for floating point numbers. If one of the
    213        * inputs is nan the other should be returned (required by both D3D10+
    214        * and OpenCL).
    215        * The sse intrinsics return the second operator in case of nan by
    216        * default so we need to special code to handle those.
    217        */
    218       if (util_cpu_caps.has_sse && type.floating &&
    219           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
    220           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
    221           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
    222          LLVMValueRef isnan, min;
    223          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
    224                                                    type,
    225                                                    intr_size, a, b);
    226          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
    227             isnan = lp_build_isnan(bld, b);
    228             return lp_build_select(bld, isnan, a, min);
    229          } else {
    230             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
    231             isnan = lp_build_isnan(bld, a);
    232             return lp_build_select(bld, isnan, a, min);
    233          }
    234       } else {
    235          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
    236                                                     type,
    237                                                     intr_size, a, b);
    238       }
    239    }
    240 
    241    if (type.floating) {
    242       switch (nan_behavior) {
    243       case GALLIVM_NAN_RETURN_NAN: {
    244          LLVMValueRef isnan = lp_build_isnan(bld, b);
    245          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
    246          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
    247          return lp_build_select(bld, cond, a, b);
    248       }
    249          break;
    250       case GALLIVM_NAN_RETURN_OTHER: {
    251          LLVMValueRef isnan = lp_build_isnan(bld, a);
    252          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
    253          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
    254          return lp_build_select(bld, cond, a, b);
    255       }
    256          break;
    257       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
    258          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
    259          return lp_build_select(bld, cond, a, b);
    260       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
    261          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
    262          return lp_build_select(bld, cond, b, a);
    263       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
    264          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
    265          return lp_build_select(bld, cond, a, b);
    266          break;
    267       default:
    268          assert(0);
    269          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
    270          return lp_build_select(bld, cond, a, b);
    271       }
    272    } else {
    273       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
    274       return lp_build_select(bld, cond, a, b);
    275    }
    276 }
    277 
    278 
    279 LLVMValueRef
    280 lp_build_fmuladd(LLVMBuilderRef builder,
    281                  LLVMValueRef a,
    282                  LLVMValueRef b,
    283                  LLVMValueRef c)
    284 {
    285    LLVMTypeRef type = LLVMTypeOf(a);
    286    assert(type == LLVMTypeOf(b));
    287    assert(type == LLVMTypeOf(c));
    288    if (HAVE_LLVM < 0x0304) {
    289       /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
    290        * not supported, and instead it falls-back to a C function.
    291        */
    292       return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
    293    }
    294    char intrinsic[32];
    295    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
    296    LLVMValueRef args[] = { a, b, c };
    297    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
    298 }
    299 
    300 
    301 /**
    302  * Generate max(a, b)
    303  * No checks for special case values of a or b = 1 or 0 are done.
    304  * NaN's are handled according to the behavior specified by the
    305  * nan_behavior argument.
    306  */
    307 static LLVMValueRef
    308 lp_build_max_simple(struct lp_build_context *bld,
    309                     LLVMValueRef a,
    310                     LLVMValueRef b,
    311                     enum gallivm_nan_behavior nan_behavior)
    312 {
    313    const struct lp_type type = bld->type;
    314    const char *intrinsic = NULL;
    315    unsigned intr_size = 0;
    316    LLVMValueRef cond;
    317 
    318    assert(lp_check_value(type, a));
    319    assert(lp_check_value(type, b));
    320 
    321    /* TODO: optimize the constant case */
    322 
    323    if (type.floating && util_cpu_caps.has_sse) {
    324       if (type.width == 32) {
    325          if (type.length == 1) {
    326             intrinsic = "llvm.x86.sse.max.ss";
    327             intr_size = 128;
    328          }
    329          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
    330             intrinsic = "llvm.x86.sse.max.ps";
    331             intr_size = 128;
    332          }
    333          else {
    334             intrinsic = "llvm.x86.avx.max.ps.256";
    335             intr_size = 256;
    336          }
    337       }
    338       if (type.width == 64 && util_cpu_caps.has_sse2) {
    339          if (type.length == 1) {
    340             intrinsic = "llvm.x86.sse2.max.sd";
    341             intr_size = 128;
    342          }
    343          else if (type.length == 2 || !util_cpu_caps.has_avx) {
    344             intrinsic = "llvm.x86.sse2.max.pd";
    345             intr_size = 128;
    346          }
    347          else {
    348             intrinsic = "llvm.x86.avx.max.pd.256";
    349             intr_size = 256;
    350          }
    351       }
    352    }
    353    else if (type.floating && util_cpu_caps.has_altivec) {
    354       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
    355           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
    356          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
    357                       __FUNCTION__);
    358       }
    359       if (type.width == 32 || type.length == 4) {
    360          intrinsic = "llvm.ppc.altivec.vmaxfp";
    361          intr_size = 128;
    362       }
    363    } else if (HAVE_LLVM < 0x0309 &&
    364               util_cpu_caps.has_avx2 && type.length > 4) {
    365       intr_size = 256;
    366       switch (type.width) {
    367       case 8:
    368          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
    369          break;
    370       case 16:
    371          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
    372          break;
    373       case 32:
    374          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
    375          break;
    376       }
    377    } else if (HAVE_LLVM < 0x0309 &&
    378               util_cpu_caps.has_sse2 && type.length >= 2) {
    379       intr_size = 128;
    380       if ((type.width == 8 || type.width == 16) &&
    381           (type.width * type.length <= 64) &&
    382           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
    383          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
    384                       __FUNCTION__);
    385          }
    386       if (type.width == 8 && !type.sign) {
    387          intrinsic = "llvm.x86.sse2.pmaxu.b";
    388          intr_size = 128;
    389       }
    390       else if (type.width == 16 && type.sign) {
    391          intrinsic = "llvm.x86.sse2.pmaxs.w";
    392       }
    393       if (util_cpu_caps.has_sse4_1) {
    394          if (type.width == 8 && type.sign) {
    395             intrinsic = "llvm.x86.sse41.pmaxsb";
    396          }
    397          if (type.width == 16 && !type.sign) {
    398             intrinsic = "llvm.x86.sse41.pmaxuw";
    399          }
    400          if (type.width == 32 && !type.sign) {
    401             intrinsic = "llvm.x86.sse41.pmaxud";
    402         }
    403          if (type.width == 32 && type.sign) {
    404             intrinsic = "llvm.x86.sse41.pmaxsd";
    405          }
    406       }
    407    } else if (util_cpu_caps.has_altivec) {
    408      intr_size = 128;
    409      if (type.width == 8) {
    410        if (!type.sign) {
    411          intrinsic = "llvm.ppc.altivec.vmaxub";
    412        } else {
    413          intrinsic = "llvm.ppc.altivec.vmaxsb";
    414        }
    415      } else if (type.width == 16) {
    416        if (!type.sign) {
    417          intrinsic = "llvm.ppc.altivec.vmaxuh";
    418        } else {
    419          intrinsic = "llvm.ppc.altivec.vmaxsh";
    420        }
    421      } else if (type.width == 32) {
    422        if (!type.sign) {
    423          intrinsic = "llvm.ppc.altivec.vmaxuw";
    424        } else {
    425          intrinsic = "llvm.ppc.altivec.vmaxsw";
    426        }
    427      }
    428    }
    429 
    430    if (intrinsic) {
    431       if (util_cpu_caps.has_sse && type.floating &&
    432           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
    433           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
    434           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
    435          LLVMValueRef isnan, max;
    436          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
    437                                                    type,
    438                                                    intr_size, a, b);
    439          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
    440             isnan = lp_build_isnan(bld, b);
    441             return lp_build_select(bld, isnan, a, max);
    442          } else {
    443             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
    444             isnan = lp_build_isnan(bld, a);
    445             return lp_build_select(bld, isnan, a, max);
    446          }
    447       } else {
    448          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
    449                                                     type,
    450                                                     intr_size, a, b);
    451       }
    452    }
    453 
    454    if (type.floating) {
    455       switch (nan_behavior) {
    456       case GALLIVM_NAN_RETURN_NAN: {
    457          LLVMValueRef isnan = lp_build_isnan(bld, b);
    458          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
    459          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
    460          return lp_build_select(bld, cond, a, b);
    461       }
    462          break;
    463       case GALLIVM_NAN_RETURN_OTHER: {
    464          LLVMValueRef isnan = lp_build_isnan(bld, a);
    465          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
    466          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
    467          return lp_build_select(bld, cond, a, b);
    468       }
    469          break;
    470       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
    471          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
    472          return lp_build_select(bld, cond, a, b);
    473       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
    474          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
    475          return lp_build_select(bld, cond, b, a);
    476       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
    477          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
    478          return lp_build_select(bld, cond, a, b);
    479          break;
    480       default:
    481          assert(0);
    482          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
    483          return lp_build_select(bld, cond, a, b);
    484       }
    485    } else {
    486       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
    487       return lp_build_select(bld, cond, a, b);
    488    }
    489 }
    490 
    491 
    492 /**
    493  * Generate 1 - a, or ~a depending on bld->type.
    494  */
    495 LLVMValueRef
    496 lp_build_comp(struct lp_build_context *bld,
    497               LLVMValueRef a)
    498 {
    499    LLVMBuilderRef builder = bld->gallivm->builder;
    500    const struct lp_type type = bld->type;
    501 
    502    assert(lp_check_value(type, a));
    503 
    504    if(a == bld->one)
    505       return bld->zero;
    506    if(a == bld->zero)
    507       return bld->one;
    508 
    509    if(type.norm && !type.floating && !type.fixed && !type.sign) {
    510       if(LLVMIsConstant(a))
    511          return LLVMConstNot(a);
    512       else
    513          return LLVMBuildNot(builder, a, "");
    514    }
    515 
    516    if(LLVMIsConstant(a))
    517       if (type.floating)
    518           return LLVMConstFSub(bld->one, a);
    519       else
    520           return LLVMConstSub(bld->one, a);
    521    else
    522       if (type.floating)
    523          return LLVMBuildFSub(builder, bld->one, a, "");
    524       else
    525          return LLVMBuildSub(builder, bld->one, a, "");
    526 }
    527 
    528 
    529 /**
    530  * Generate a + b
    531  */
    532 LLVMValueRef
    533 lp_build_add(struct lp_build_context *bld,
    534              LLVMValueRef a,
    535              LLVMValueRef b)
    536 {
    537    LLVMBuilderRef builder = bld->gallivm->builder;
    538    const struct lp_type type = bld->type;
    539    LLVMValueRef res;
    540 
    541    assert(lp_check_value(type, a));
    542    assert(lp_check_value(type, b));
    543 
    544    if(a == bld->zero)
    545       return b;
    546    if(b == bld->zero)
    547       return a;
    548    if(a == bld->undef || b == bld->undef)
    549       return bld->undef;
    550 
    551    if(bld->type.norm) {
    552       const char *intrinsic = NULL;
    553 
    554       if(a == bld->one || b == bld->one)
    555         return bld->one;
    556 
    557       if (!type.floating && !type.fixed) {
    558          if (type.width * type.length == 128) {
    559             if(util_cpu_caps.has_sse2) {
    560               if(type.width == 8)
    561                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
    562               if(type.width == 16)
    563                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
    564             } else if (util_cpu_caps.has_altivec) {
    565               if(type.width == 8)
    566                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
    567               if(type.width == 16)
    568                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
    569             }
    570          }
    571          if (type.width * type.length == 256) {
    572             if(util_cpu_caps.has_avx2) {
    573               if(type.width == 8)
    574                 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
    575               if(type.width == 16)
    576                 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
    577             }
    578          }
    579       }
    580 
    581       if (intrinsic)
    582          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
    583    }
    584 
    585    if(type.norm && !type.floating && !type.fixed) {
    586       if (type.sign) {
    587          uint64_t sign = (uint64_t)1 << (type.width - 1);
    588          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
    589          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
    590          /* a_clamp_max is the maximum a for positive b,
    591             a_clamp_min is the minimum a for negative b. */
    592          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    593          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    594          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
    595       } else {
    596          a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    597       }
    598    }
    599 
    600    if(LLVMIsConstant(a) && LLVMIsConstant(b))
    601       if (type.floating)
    602          res = LLVMConstFAdd(a, b);
    603       else
    604          res = LLVMConstAdd(a, b);
    605    else
    606       if (type.floating)
    607          res = LLVMBuildFAdd(builder, a, b, "");
    608       else
    609          res = LLVMBuildAdd(builder, a, b, "");
    610 
    611    /* clamp to ceiling of 1.0 */
    612    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
    613       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    614 
    615    /* XXX clamp to floor of -1 or 0??? */
    616 
    617    return res;
    618 }
    619 
    620 
    621 /** Return the scalar sum of the elements of a.
    622  * Should avoid this operation whenever possible.
    623  */
    624 LLVMValueRef
    625 lp_build_horizontal_add(struct lp_build_context *bld,
    626                         LLVMValueRef a)
    627 {
    628    LLVMBuilderRef builder = bld->gallivm->builder;
    629    const struct lp_type type = bld->type;
    630    LLVMValueRef index, res;
    631    unsigned i, length;
    632    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
    633    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
    634    LLVMValueRef vecres, elem2;
    635 
    636    assert(lp_check_value(type, a));
    637 
    638    if (type.length == 1) {
    639       return a;
    640    }
    641 
    642    assert(!bld->type.norm);
    643 
    644    /*
    645     * for byte vectors can do much better with psadbw.
    646     * Using repeated shuffle/adds here. Note with multiple vectors
    647     * this can be done more efficiently as outlined in the intel
    648     * optimization manual.
    649     * Note: could cause data rearrangement if used with smaller element
    650     * sizes.
    651     */
    652 
    653    vecres = a;
    654    length = type.length / 2;
    655    while (length > 1) {
    656       LLVMValueRef vec1, vec2;
    657       for (i = 0; i < length; i++) {
    658          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
    659          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
    660       }
    661       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
    662                                     LLVMConstVector(shuffles1, length), "");
    663       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
    664                                     LLVMConstVector(shuffles2, length), "");
    665       if (type.floating) {
    666          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
    667       }
    668       else {
    669          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
    670       }
    671       length = length >> 1;
    672    }
    673 
    674    /* always have vector of size 2 here */
    675    assert(length == 1);
    676 
    677    index = lp_build_const_int32(bld->gallivm, 0);
    678    res = LLVMBuildExtractElement(builder, vecres, index, "");
    679    index = lp_build_const_int32(bld->gallivm, 1);
    680    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
    681 
    682    if (type.floating)
    683       res = LLVMBuildFAdd(builder, res, elem2, "");
    684     else
    685       res = LLVMBuildAdd(builder, res, elem2, "");
    686 
    687    return res;
    688 }
    689 
    690 /**
    691  * Return the horizontal sums of 4 float vectors as a float4 vector.
    692  * This uses the technique as outlined in Intel Optimization Manual.
    693  */
    694 static LLVMValueRef
    695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
    696                             LLVMValueRef src[4])
    697 {
    698    struct gallivm_state *gallivm = bld->gallivm;
    699    LLVMBuilderRef builder = gallivm->builder;
    700    LLVMValueRef shuffles[4];
    701    LLVMValueRef tmp[4];
    702    LLVMValueRef sumtmp[2], shuftmp[2];
    703 
    704    /* lower half of regs */
    705    shuffles[0] = lp_build_const_int32(gallivm, 0);
    706    shuffles[1] = lp_build_const_int32(gallivm, 1);
    707    shuffles[2] = lp_build_const_int32(gallivm, 4);
    708    shuffles[3] = lp_build_const_int32(gallivm, 5);
    709    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
    710                                    LLVMConstVector(shuffles, 4), "");
    711    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
    712                                    LLVMConstVector(shuffles, 4), "");
    713 
    714    /* upper half of regs */
    715    shuffles[0] = lp_build_const_int32(gallivm, 2);
    716    shuffles[1] = lp_build_const_int32(gallivm, 3);
    717    shuffles[2] = lp_build_const_int32(gallivm, 6);
    718    shuffles[3] = lp_build_const_int32(gallivm, 7);
    719    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
    720                                    LLVMConstVector(shuffles, 4), "");
    721    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
    722                                    LLVMConstVector(shuffles, 4), "");
    723 
    724    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
    725    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
    726 
    727    shuffles[0] = lp_build_const_int32(gallivm, 0);
    728    shuffles[1] = lp_build_const_int32(gallivm, 2);
    729    shuffles[2] = lp_build_const_int32(gallivm, 4);
    730    shuffles[3] = lp_build_const_int32(gallivm, 6);
    731    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
    732                                        LLVMConstVector(shuffles, 4), "");
    733 
    734    shuffles[0] = lp_build_const_int32(gallivm, 1);
    735    shuffles[1] = lp_build_const_int32(gallivm, 3);
    736    shuffles[2] = lp_build_const_int32(gallivm, 5);
    737    shuffles[3] = lp_build_const_int32(gallivm, 7);
    738    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
    739                                        LLVMConstVector(shuffles, 4), "");
    740 
    741    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
    742 }
    743 
    744 
    745 /*
    746  * partially horizontally add 2-4 float vectors with length nx4,
    747  * i.e. only four adjacent values in each vector will be added,
    748  * assuming values are really grouped in 4 which also determines
    749  * output order.
    750  *
    751  * Return a vector of the same length as the initial vectors,
    752  * with the excess elements (if any) being undefined.
    753  * The element order is independent of number of input vectors.
    754  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
    755  * the output order thus will be
    756  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
    757  */
    758 LLVMValueRef
    759 lp_build_hadd_partial4(struct lp_build_context *bld,
    760                        LLVMValueRef vectors[],
    761                        unsigned num_vecs)
    762 {
    763    struct gallivm_state *gallivm = bld->gallivm;
    764    LLVMBuilderRef builder = gallivm->builder;
    765    LLVMValueRef ret_vec;
    766    LLVMValueRef tmp[4];
    767    const char *intrinsic = NULL;
    768 
    769    assert(num_vecs >= 2 && num_vecs <= 4);
    770    assert(bld->type.floating);
    771 
    772    /* only use this with at least 2 vectors, as it is sort of expensive
    773     * (depending on cpu) and we always need two horizontal adds anyway,
    774     * so a shuffle/add approach might be better.
    775     */
    776 
    777    tmp[0] = vectors[0];
    778    tmp[1] = vectors[1];
    779 
    780    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
    781    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
    782 
    783    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
    784        bld->type.length == 4) {
    785       intrinsic = "llvm.x86.sse3.hadd.ps";
    786    }
    787    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
    788             bld->type.length == 8) {
    789       intrinsic = "llvm.x86.avx.hadd.ps.256";
    790    }
    791    if (intrinsic) {
    792       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
    793                                        lp_build_vec_type(gallivm, bld->type),
    794                                        tmp[0], tmp[1]);
    795       if (num_vecs > 2) {
    796          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
    797                                           lp_build_vec_type(gallivm, bld->type),
    798                                           tmp[2], tmp[3]);
    799       }
    800       else {
    801          tmp[1] = tmp[0];
    802       }
    803       return lp_build_intrinsic_binary(builder, intrinsic,
    804                                        lp_build_vec_type(gallivm, bld->type),
    805                                        tmp[0], tmp[1]);
    806    }
    807 
    808    if (bld->type.length == 4) {
    809       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
    810    }
    811    else {
    812       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
    813       unsigned j;
    814       unsigned num_iter = bld->type.length / 4;
    815       struct lp_type parttype = bld->type;
    816       parttype.length = 4;
    817       for (j = 0; j < num_iter; j++) {
    818          LLVMValueRef partsrc[4];
    819          unsigned i;
    820          for (i = 0; i < 4; i++) {
    821             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
    822          }
    823          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
    824       }
    825       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
    826    }
    827    return ret_vec;
    828 }
    829 
    830 /**
    831  * Generate a - b
    832  */
    833 LLVMValueRef
    834 lp_build_sub(struct lp_build_context *bld,
    835              LLVMValueRef a,
    836              LLVMValueRef b)
    837 {
    838    LLVMBuilderRef builder = bld->gallivm->builder;
    839    const struct lp_type type = bld->type;
    840    LLVMValueRef res;
    841 
    842    assert(lp_check_value(type, a));
    843    assert(lp_check_value(type, b));
    844 
    845    if(b == bld->zero)
    846       return a;
    847    if(a == bld->undef || b == bld->undef)
    848       return bld->undef;
    849    if(a == b)
    850       return bld->zero;
    851 
    852    if(bld->type.norm) {
    853       const char *intrinsic = NULL;
    854 
    855       if(b == bld->one)
    856         return bld->zero;
    857 
    858       if (!type.floating && !type.fixed) {
    859          if (type.width * type.length == 128) {
    860             if (util_cpu_caps.has_sse2) {
    861               if(type.width == 8)
    862                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
    863               if(type.width == 16)
    864                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
    865             } else if (util_cpu_caps.has_altivec) {
    866               if(type.width == 8)
    867                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
    868               if(type.width == 16)
    869                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
    870             }
    871          }
    872          if (type.width * type.length == 256) {
    873             if (util_cpu_caps.has_avx2) {
    874               if(type.width == 8)
    875                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
    876               if(type.width == 16)
    877                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
    878             }
    879          }
    880       }
    881 
    882       if (intrinsic)
    883          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
    884    }
    885 
    886    if(type.norm && !type.floating && !type.fixed) {
    887       if (type.sign) {
    888          uint64_t sign = (uint64_t)1 << (type.width - 1);
    889          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
    890          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
    891          /* a_clamp_max is the maximum a for negative b,
    892             a_clamp_min is the minimum a for positive b. */
    893          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    894          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    895          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
    896       } else {
    897          a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    898       }
    899    }
    900 
    901    if(LLVMIsConstant(a) && LLVMIsConstant(b))
    902       if (type.floating)
    903          res = LLVMConstFSub(a, b);
    904       else
    905          res = LLVMConstSub(a, b);
    906    else
    907       if (type.floating)
    908          res = LLVMBuildFSub(builder, a, b, "");
    909       else
    910          res = LLVMBuildSub(builder, a, b, "");
    911 
    912    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
    913       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
    914 
    915    return res;
    916 }
    917 
    918 
    919 
    920 /**
    921  * Normalized multiplication.
    922  *
    923  * There are several approaches for (using 8-bit normalized multiplication as
    924  * an example):
    925  *
    926  * - alpha plus one
    927  *
    928  *     makes the following approximation to the division (Sree)
    929  *
    930  *       a*b/255 ~= (a*(b + 1)) >> 256
    931  *
    932  *     which is the fastest method that satisfies the following OpenGL criteria of
    933  *
    934  *       0*0 = 0 and 255*255 = 255
    935  *
    936  * - geometric series
    937  *
    938  *     takes the geometric series approximation to the division
    939  *
    940  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
    941  *
    942  *     in this case just the first two terms to fit in 16bit arithmetic
    943  *
    944  *       t/255 ~= (t + (t >> 8)) >> 8
    945  *
    946  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
    947  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
    948  *     must be used.
    949  *
    950  * - geometric series plus rounding
    951  *
    952  *     when using a geometric series division instead of truncating the result
    953  *     use roundoff in the approximation (Jim Blinn)
    954  *
    955  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
    956  *
    957  *     achieving the exact results.
    958  *
    959  *
    960  *
    961  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
    962  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
    963  * @sa Michael Herf, The "double blend trick", May 2000,
    964  *     http://www.stereopsis.com/doubleblend.html
    965  */
    966 static LLVMValueRef
    967 lp_build_mul_norm(struct gallivm_state *gallivm,
    968                   struct lp_type wide_type,
    969                   LLVMValueRef a, LLVMValueRef b)
    970 {
    971    LLVMBuilderRef builder = gallivm->builder;
    972    struct lp_build_context bld;
    973    unsigned n;
    974    LLVMValueRef half;
    975    LLVMValueRef ab;
    976 
    977    assert(!wide_type.floating);
    978    assert(lp_check_value(wide_type, a));
    979    assert(lp_check_value(wide_type, b));
    980 
    981    lp_build_context_init(&bld, gallivm, wide_type);
    982 
    983    n = wide_type.width / 2;
    984    if (wide_type.sign) {
    985       --n;
    986    }
    987 
    988    /*
    989     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    990     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    991     */
    992 
    993    /*
    994     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    995     */
    996 
    997    ab = LLVMBuildMul(builder, a, b, "");
    998    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
    999 
   1000    /*
   1001     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
   1002     */
   1003 
   1004    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   1005    if (wide_type.sign) {
   1006       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
   1007       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
   1008       half = lp_build_select(&bld, sign, minus_half, half);
   1009    }
   1010    ab = LLVMBuildAdd(builder, ab, half, "");
   1011 
   1012    /* Final division */
   1013    ab = lp_build_shr_imm(&bld, ab, n);
   1014 
   1015    return ab;
   1016 }
   1017 
   1018 /**
   1019  * Generate a * b
   1020  */
   1021 LLVMValueRef
   1022 lp_build_mul(struct lp_build_context *bld,
   1023              LLVMValueRef a,
   1024              LLVMValueRef b)
   1025 {
   1026    LLVMBuilderRef builder = bld->gallivm->builder;
   1027    const struct lp_type type = bld->type;
   1028    LLVMValueRef shift;
   1029    LLVMValueRef res;
   1030 
   1031    assert(lp_check_value(type, a));
   1032    assert(lp_check_value(type, b));
   1033 
   1034    if(a == bld->zero)
   1035       return bld->zero;
   1036    if(a == bld->one)
   1037       return b;
   1038    if(b == bld->zero)
   1039       return bld->zero;
   1040    if(b == bld->one)
   1041       return a;
   1042    if(a == bld->undef || b == bld->undef)
   1043       return bld->undef;
   1044 
   1045    if (!type.floating && !type.fixed && type.norm) {
   1046       struct lp_type wide_type = lp_wider_type(type);
   1047       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
   1048 
   1049       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
   1050       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
   1051 
   1052       /* PMULLW, PSRLW, PADDW */
   1053       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
   1054       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
   1055 
   1056       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
   1057 
   1058       return ab;
   1059    }
   1060 
   1061    if(type.fixed)
   1062       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   1063    else
   1064       shift = NULL;
   1065 
   1066    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
   1067       if (type.floating)
   1068          res = LLVMConstFMul(a, b);
   1069       else
   1070          res = LLVMConstMul(a, b);
   1071       if(shift) {
   1072          if(type.sign)
   1073             res = LLVMConstAShr(res, shift);
   1074          else
   1075             res = LLVMConstLShr(res, shift);
   1076       }
   1077    }
   1078    else {
   1079       if (type.floating)
   1080          res = LLVMBuildFMul(builder, a, b, "");
   1081       else
   1082          res = LLVMBuildMul(builder, a, b, "");
   1083       if(shift) {
   1084          if(type.sign)
   1085             res = LLVMBuildAShr(builder, res, shift, "");
   1086          else
   1087             res = LLVMBuildLShr(builder, res, shift, "");
   1088       }
   1089    }
   1090 
   1091    return res;
   1092 }
   1093 
   1094 /*
   1095  * Widening mul, valid for 32x32 bit -> 64bit only.
   1096  * Result is low 32bits, high bits returned in res_hi.
   1097  *
   1098  * Emits code that is meant to be compiled for the host CPU.
   1099  */
   1100 LLVMValueRef
   1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
   1102                          LLVMValueRef a,
   1103                          LLVMValueRef b,
   1104                          LLVMValueRef *res_hi)
   1105 {
   1106    struct gallivm_state *gallivm = bld->gallivm;
   1107    LLVMBuilderRef builder = gallivm->builder;
   1108 
   1109    assert(bld->type.width == 32);
   1110    assert(bld->type.floating == 0);
   1111    assert(bld->type.fixed == 0);
   1112    assert(bld->type.norm == 0);
   1113 
   1114    /*
   1115     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
   1116     * for x86 simd is atrocious (even if the high bits weren't required),
   1117     * trying to handle real 64bit inputs (which of course can't happen due
   1118     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
   1119     * apparently llvm does not recognize this widening mul). This includes 6
   1120     * (instead of 2) pmuludq plus extra adds and shifts
   1121     * The same story applies to signed mul, albeit fixing this requires sse41.
   1122     * https://llvm.org/bugs/show_bug.cgi?id=30845
   1123     * So, whip up our own code, albeit only for length 4 and 8 (which
   1124     * should be good enough)...
   1125     */
   1126    if ((bld->type.length == 4 || bld->type.length == 8) &&
   1127        ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
   1128         util_cpu_caps.has_sse4_1)) {
   1129       const char *intrinsic = NULL;
   1130       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
   1131       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
   1132       struct lp_type type_wide = lp_wider_type(bld->type);
   1133       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
   1134       unsigned i;
   1135       for (i = 0; i < bld->type.length; i += 2) {
   1136          shuf[i] = lp_build_const_int32(gallivm, i+1);
   1137          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   1138       }
   1139       shuf_vec = LLVMConstVector(shuf, bld->type.length);
   1140       aeven = a;
   1141       beven = b;
   1142       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
   1143       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
   1144 
   1145       if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
   1146          if (bld->type.sign) {
   1147             intrinsic = "llvm.x86.avx2.pmul.dq";
   1148          } else {
   1149             intrinsic = "llvm.x86.avx2.pmulu.dq";
   1150          }
   1151          muleven = lp_build_intrinsic_binary(builder, intrinsic,
   1152                                              wider_type, aeven, beven);
   1153          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
   1154                                             wider_type, aodd, bodd);
   1155       }
   1156       else {
   1157          /* for consistent naming look elsewhere... */
   1158          if (bld->type.sign) {
   1159             intrinsic = "llvm.x86.sse41.pmuldq";
   1160          } else {
   1161             intrinsic = "llvm.x86.sse2.pmulu.dq";
   1162          }
   1163          /*
   1164           * XXX If we only have AVX but not AVX2 this is a pain.
   1165           * lp_build_intrinsic_binary_anylength() can't handle it
   1166           * (due to src and dst type not being identical).
   1167           */
   1168          if (bld->type.length == 8) {
   1169             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
   1170             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
   1171             LLVMValueRef muleven2[2], mulodd2[2];
   1172             struct lp_type type_wide_half = type_wide;
   1173             LLVMTypeRef wtype_half;
   1174             type_wide_half.length = 2;
   1175             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
   1176             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
   1177             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
   1178             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
   1179             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
   1180             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
   1181             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
   1182             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
   1183             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
   1184             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
   1185                                                     wtype_half, aevenlo, bevenlo);
   1186             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
   1187                                                    wtype_half, aoddlo, boddlo);
   1188             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
   1189                                                     wtype_half, aevenhi, bevenhi);
   1190             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
   1191                                                    wtype_half, aoddhi, boddhi);
   1192             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
   1193             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
   1194 
   1195          }
   1196          else {
   1197             muleven = lp_build_intrinsic_binary(builder, intrinsic,
   1198                                                 wider_type, aeven, beven);
   1199             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
   1200                                                wider_type, aodd, bodd);
   1201          }
   1202       }
   1203       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
   1204       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
   1205 
   1206       for (i = 0; i < bld->type.length; i += 2) {
   1207          shuf[i] = lp_build_const_int32(gallivm, i + 1);
   1208          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
   1209       }
   1210       shuf_vec = LLVMConstVector(shuf, bld->type.length);
   1211       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   1212 
   1213       for (i = 0; i < bld->type.length; i += 2) {
   1214          shuf[i] = lp_build_const_int32(gallivm, i);
   1215          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
   1216       }
   1217       shuf_vec = LLVMConstVector(shuf, bld->type.length);
   1218       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   1219    }
   1220    else {
   1221       return lp_build_mul_32_lohi(bld, a, b, res_hi);
   1222    }
   1223 }
   1224 
   1225 
   1226 /*
   1227  * Widening mul, valid for 32x32 bit -> 64bit only.
   1228  * Result is low 32bits, high bits returned in res_hi.
   1229  *
   1230  * Emits generic code.
   1231  */
   1232 LLVMValueRef
   1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
   1234                      LLVMValueRef a,
   1235                      LLVMValueRef b,
   1236                      LLVMValueRef *res_hi)
   1237 {
   1238    struct gallivm_state *gallivm = bld->gallivm;
   1239    LLVMBuilderRef builder = gallivm->builder;
   1240    LLVMValueRef tmp, shift, res_lo;
   1241    struct lp_type type_tmp;
   1242    LLVMTypeRef wide_type, narrow_type;
   1243 
   1244    type_tmp = bld->type;
   1245    narrow_type = lp_build_vec_type(gallivm, type_tmp);
   1246    type_tmp.width *= 2;
   1247    wide_type = lp_build_vec_type(gallivm, type_tmp);
   1248    shift = lp_build_const_vec(gallivm, type_tmp, 32);
   1249 
   1250    if (bld->type.sign) {
   1251       a = LLVMBuildSExt(builder, a, wide_type, "");
   1252       b = LLVMBuildSExt(builder, b, wide_type, "");
   1253    } else {
   1254       a = LLVMBuildZExt(builder, a, wide_type, "");
   1255       b = LLVMBuildZExt(builder, b, wide_type, "");
   1256    }
   1257    tmp = LLVMBuildMul(builder, a, b, "");
   1258 
   1259    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
   1260 
   1261    /* Since we truncate anyway, LShr and AShr are equivalent. */
   1262    tmp = LLVMBuildLShr(builder, tmp, shift, "");
   1263    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
   1264 
   1265    return res_lo;
   1266 }
   1267 
   1268 
   1269 /* a * b + c */
   1270 LLVMValueRef
   1271 lp_build_mad(struct lp_build_context *bld,
   1272              LLVMValueRef a,
   1273              LLVMValueRef b,
   1274              LLVMValueRef c)
   1275 {
   1276    const struct lp_type type = bld->type;
   1277    if (type.floating) {
   1278       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   1279    } else {
   1280       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   1281    }
   1282 }
   1283 
   1284 
   1285 /**
   1286  * Small vector x scale multiplication optimization.
   1287  */
   1288 LLVMValueRef
   1289 lp_build_mul_imm(struct lp_build_context *bld,
   1290                  LLVMValueRef a,
   1291                  int b)
   1292 {
   1293    LLVMBuilderRef builder = bld->gallivm->builder;
   1294    LLVMValueRef factor;
   1295 
   1296    assert(lp_check_value(bld->type, a));
   1297 
   1298    if(b == 0)
   1299       return bld->zero;
   1300 
   1301    if(b == 1)
   1302       return a;
   1303 
   1304    if(b == -1)
   1305       return lp_build_negate(bld, a);
   1306 
   1307    if(b == 2 && bld->type.floating)
   1308       return lp_build_add(bld, a, a);
   1309 
   1310    if(util_is_power_of_two(b)) {
   1311       unsigned shift = ffs(b) - 1;
   1312 
   1313       if(bld->type.floating) {
   1314 #if 0
   1315          /*
   1316           * Power of two multiplication by directly manipulating the exponent.
   1317           *
   1318           * XXX: This might not be always faster, it will introduce a small error
   1319           * for multiplication by zero, and it will produce wrong results
   1320           * for Inf and NaN.
   1321           */
   1322          unsigned mantissa = lp_mantissa(bld->type);
   1323          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
   1324          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
   1325          a = LLVMBuildAdd(builder, a, factor, "");
   1326          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
   1327          return a;
   1328 #endif
   1329       }
   1330       else {
   1331          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
   1332          return LLVMBuildShl(builder, a, factor, "");
   1333       }
   1334    }
   1335 
   1336    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   1337    return lp_build_mul(bld, a, factor);
   1338 }
   1339 
   1340 
   1341 /**
   1342  * Generate a / b
   1343  */
   1344 LLVMValueRef
   1345 lp_build_div(struct lp_build_context *bld,
   1346              LLVMValueRef a,
   1347              LLVMValueRef b)
   1348 {
   1349    LLVMBuilderRef builder = bld->gallivm->builder;
   1350    const struct lp_type type = bld->type;
   1351 
   1352    assert(lp_check_value(type, a));
   1353    assert(lp_check_value(type, b));
   1354 
   1355    if(a == bld->zero)
   1356       return bld->zero;
   1357    if(a == bld->one && type.floating)
   1358       return lp_build_rcp(bld, b);
   1359    if(b == bld->zero)
   1360       return bld->undef;
   1361    if(b == bld->one)
   1362       return a;
   1363    if(a == bld->undef || b == bld->undef)
   1364       return bld->undef;
   1365 
   1366    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
   1367       if (type.floating)
   1368          return LLVMConstFDiv(a, b);
   1369       else if (type.sign)
   1370          return LLVMConstSDiv(a, b);
   1371       else
   1372          return LLVMConstUDiv(a, b);
   1373    }
   1374 
   1375    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   1376        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
   1377       type.floating)
   1378       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
   1379 
   1380    if (type.floating)
   1381       return LLVMBuildFDiv(builder, a, b, "");
   1382    else if (type.sign)
   1383       return LLVMBuildSDiv(builder, a, b, "");
   1384    else
   1385       return LLVMBuildUDiv(builder, a, b, "");
   1386 }
   1387 
   1388 
   1389 /**
   1390  * Linear interpolation helper.
   1391  *
   1392  * @param normalized whether we are interpolating normalized values,
   1393  *        encoded in normalized integers, twice as wide.
   1394  *
   1395  * @sa http://www.stereopsis.com/doubleblend.html
   1396  */
   1397 static inline LLVMValueRef
   1398 lp_build_lerp_simple(struct lp_build_context *bld,
   1399                      LLVMValueRef x,
   1400                      LLVMValueRef v0,
   1401                      LLVMValueRef v1,
   1402                      unsigned flags)
   1403 {
   1404    unsigned half_width = bld->type.width/2;
   1405    LLVMBuilderRef builder = bld->gallivm->builder;
   1406    LLVMValueRef delta;
   1407    LLVMValueRef res;
   1408 
   1409    assert(lp_check_value(bld->type, x));
   1410    assert(lp_check_value(bld->type, v0));
   1411    assert(lp_check_value(bld->type, v1));
   1412 
   1413    delta = lp_build_sub(bld, v1, v0);
   1414 
   1415    if (bld->type.floating) {
   1416       assert(flags == 0);
   1417       return lp_build_mad(bld, x, delta, v0);
   1418    }
   1419 
   1420    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
   1421       if (!bld->type.sign) {
   1422          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
   1423             /*
   1424              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
   1425              * most-significant-bit to the lowest-significant-bit, so that
   1426              * later we can just divide by 2**n instead of 2**n - 1.
   1427              */
   1428 
   1429             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
   1430          }
   1431 
   1432          /* (x * delta) >> n */
   1433          res = lp_build_mul(bld, x, delta);
   1434          res = lp_build_shr_imm(bld, res, half_width);
   1435       } else {
   1436          /*
   1437           * The rescaling trick above doesn't work for signed numbers, so
   1438           * use the 2**n - 1 divison approximation in lp_build_mul_norm
   1439           * instead.
   1440           */
   1441          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
   1442          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
   1443       }
   1444    } else {
   1445       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
   1446       res = lp_build_mul(bld, x, delta);
   1447    }
   1448 
   1449    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
   1450       /*
   1451        * At this point both res and v0 only use the lower half of the bits,
   1452        * the rest is zero. Instead of add / mask, do add with half wide type.
   1453        */
   1454       struct lp_type narrow_type;
   1455       struct lp_build_context narrow_bld;
   1456 
   1457       memset(&narrow_type, 0, sizeof narrow_type);
   1458       narrow_type.sign   = bld->type.sign;
   1459       narrow_type.width  = bld->type.width/2;
   1460       narrow_type.length = bld->type.length*2;
   1461 
   1462       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
   1463       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
   1464       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
   1465       res = lp_build_add(&narrow_bld, v0, res);
   1466       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   1467    } else {
   1468       res = lp_build_add(bld, v0, res);
   1469 
   1470       if (bld->type.fixed) {
   1471          /*
   1472           * We need to mask out the high order bits when lerping 8bit
   1473           * normalized colors stored on 16bits
   1474           */
   1475          /* XXX: This step is necessary for lerping 8bit colors stored on
   1476           * 16bits, but it will be wrong for true fixed point use cases.
   1477           * Basically we need a more powerful lp_type, capable of further
   1478           * distinguishing the values interpretation from the value storage.
   1479           */
   1480          LLVMValueRef low_bits;
   1481          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
   1482          res = LLVMBuildAnd(builder, res, low_bits, "");
   1483       }
   1484    }
   1485 
   1486    return res;
   1487 }
   1488 
   1489 
   1490 /**
   1491  * Linear interpolation.
   1492  */
   1493 LLVMValueRef
   1494 lp_build_lerp(struct lp_build_context *bld,
   1495               LLVMValueRef x,
   1496               LLVMValueRef v0,
   1497               LLVMValueRef v1,
   1498               unsigned flags)
   1499 {
   1500    const struct lp_type type = bld->type;
   1501    LLVMValueRef res;
   1502 
   1503    assert(lp_check_value(type, x));
   1504    assert(lp_check_value(type, v0));
   1505    assert(lp_check_value(type, v1));
   1506 
   1507    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
   1508 
   1509    if (type.norm) {
   1510       struct lp_type wide_type;
   1511       struct lp_build_context wide_bld;
   1512       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
   1513 
   1514       assert(type.length >= 2);
   1515 
   1516       /*
   1517        * Create a wider integer type, enough to hold the
   1518        * intermediate result of the multiplication.
   1519        */
   1520       memset(&wide_type, 0, sizeof wide_type);
   1521       wide_type.sign   = type.sign;
   1522       wide_type.width  = type.width*2;
   1523       wide_type.length = type.length/2;
   1524 
   1525       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
   1526 
   1527       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
   1528       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
   1529       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
   1530 
   1531       /*
   1532        * Lerp both halves.
   1533        */
   1534 
   1535       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
   1536 
   1537       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
   1538       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
   1539 
   1540       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   1541    } else {
   1542       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   1543    }
   1544 
   1545    return res;
   1546 }
   1547 
   1548 
   1549 /**
   1550  * Bilinear interpolation.
   1551  *
   1552  * Values indices are in v_{yx}.
   1553  */
   1554 LLVMValueRef
   1555 lp_build_lerp_2d(struct lp_build_context *bld,
   1556                  LLVMValueRef x,
   1557                  LLVMValueRef y,
   1558                  LLVMValueRef v00,
   1559                  LLVMValueRef v01,
   1560                  LLVMValueRef v10,
   1561                  LLVMValueRef v11,
   1562                  unsigned flags)
   1563 {
   1564    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   1565    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   1566    return lp_build_lerp(bld, y, v0, v1, flags);
   1567 }
   1568 
   1569 
   1570 LLVMValueRef
   1571 lp_build_lerp_3d(struct lp_build_context *bld,
   1572                  LLVMValueRef x,
   1573                  LLVMValueRef y,
   1574                  LLVMValueRef z,
   1575                  LLVMValueRef v000,
   1576                  LLVMValueRef v001,
   1577                  LLVMValueRef v010,
   1578                  LLVMValueRef v011,
   1579                  LLVMValueRef v100,
   1580                  LLVMValueRef v101,
   1581                  LLVMValueRef v110,
   1582                  LLVMValueRef v111,
   1583                  unsigned flags)
   1584 {
   1585    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   1586    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   1587    return lp_build_lerp(bld, z, v0, v1, flags);
   1588 }
   1589 
   1590 
   1591 /**
   1592  * Generate min(a, b)
   1593  * Do checks for special cases but not for nans.
   1594  */
   1595 LLVMValueRef
   1596 lp_build_min(struct lp_build_context *bld,
   1597              LLVMValueRef a,
   1598              LLVMValueRef b)
   1599 {
   1600    assert(lp_check_value(bld->type, a));
   1601    assert(lp_check_value(bld->type, b));
   1602 
   1603    if(a == bld->undef || b == bld->undef)
   1604       return bld->undef;
   1605 
   1606    if(a == b)
   1607       return a;
   1608 
   1609    if (bld->type.norm) {
   1610       if (!bld->type.sign) {
   1611          if (a == bld->zero || b == bld->zero) {
   1612             return bld->zero;
   1613          }
   1614       }
   1615       if(a == bld->one)
   1616          return b;
   1617       if(b == bld->one)
   1618          return a;
   1619    }
   1620 
   1621    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
   1622 }
   1623 
   1624 
   1625 /**
   1626  * Generate min(a, b)
   1627  * NaN's are handled according to the behavior specified by the
   1628  * nan_behavior argument.
   1629  */
   1630 LLVMValueRef
   1631 lp_build_min_ext(struct lp_build_context *bld,
   1632                  LLVMValueRef a,
   1633                  LLVMValueRef b,
   1634                  enum gallivm_nan_behavior nan_behavior)
   1635 {
   1636    assert(lp_check_value(bld->type, a));
   1637    assert(lp_check_value(bld->type, b));
   1638 
   1639    if(a == bld->undef || b == bld->undef)
   1640       return bld->undef;
   1641 
   1642    if(a == b)
   1643       return a;
   1644 
   1645    if (bld->type.norm) {
   1646       if (!bld->type.sign) {
   1647          if (a == bld->zero || b == bld->zero) {
   1648             return bld->zero;
   1649          }
   1650       }
   1651       if(a == bld->one)
   1652          return b;
   1653       if(b == bld->one)
   1654          return a;
   1655    }
   1656 
   1657    return lp_build_min_simple(bld, a, b, nan_behavior);
   1658 }
   1659 
   1660 /**
   1661  * Generate max(a, b)
   1662  * Do checks for special cases, but NaN behavior is undefined.
   1663  */
   1664 LLVMValueRef
   1665 lp_build_max(struct lp_build_context *bld,
   1666              LLVMValueRef a,
   1667              LLVMValueRef b)
   1668 {
   1669    assert(lp_check_value(bld->type, a));
   1670    assert(lp_check_value(bld->type, b));
   1671 
   1672    if(a == bld->undef || b == bld->undef)
   1673       return bld->undef;
   1674 
   1675    if(a == b)
   1676       return a;
   1677 
   1678    if(bld->type.norm) {
   1679       if(a == bld->one || b == bld->one)
   1680          return bld->one;
   1681       if (!bld->type.sign) {
   1682          if (a == bld->zero) {
   1683             return b;
   1684          }
   1685          if (b == bld->zero) {
   1686             return a;
   1687          }
   1688       }
   1689    }
   1690 
   1691    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
   1692 }
   1693 
   1694 
   1695 /**
   1696  * Generate max(a, b)
   1697  * Checks for special cases.
   1698  * NaN's are handled according to the behavior specified by the
   1699  * nan_behavior argument.
   1700  */
   1701 LLVMValueRef
   1702 lp_build_max_ext(struct lp_build_context *bld,
   1703                   LLVMValueRef a,
   1704                   LLVMValueRef b,
   1705                   enum gallivm_nan_behavior nan_behavior)
   1706 {
   1707    assert(lp_check_value(bld->type, a));
   1708    assert(lp_check_value(bld->type, b));
   1709 
   1710    if(a == bld->undef || b == bld->undef)
   1711       return bld->undef;
   1712 
   1713    if(a == b)
   1714       return a;
   1715 
   1716    if(bld->type.norm) {
   1717       if(a == bld->one || b == bld->one)
   1718          return bld->one;
   1719       if (!bld->type.sign) {
   1720          if (a == bld->zero) {
   1721             return b;
   1722          }
   1723          if (b == bld->zero) {
   1724             return a;
   1725          }
   1726       }
   1727    }
   1728 
   1729    return lp_build_max_simple(bld, a, b, nan_behavior);
   1730 }
   1731 
   1732 /**
   1733  * Generate clamp(a, min, max)
   1734  * NaN behavior (for any of a, min, max) is undefined.
   1735  * Do checks for special cases.
   1736  */
   1737 LLVMValueRef
   1738 lp_build_clamp(struct lp_build_context *bld,
   1739                LLVMValueRef a,
   1740                LLVMValueRef min,
   1741                LLVMValueRef max)
   1742 {
   1743    assert(lp_check_value(bld->type, a));
   1744    assert(lp_check_value(bld->type, min));
   1745    assert(lp_check_value(bld->type, max));
   1746 
   1747    a = lp_build_min(bld, a, max);
   1748    a = lp_build_max(bld, a, min);
   1749    return a;
   1750 }
   1751 
   1752 
   1753 /**
   1754  * Generate clamp(a, 0, 1)
   1755  * A NaN will get converted to zero.
   1756  */
   1757 LLVMValueRef
   1758 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
   1759                                 LLVMValueRef a)
   1760 {
   1761    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   1762    a = lp_build_min(bld, a, bld->one);
   1763    return a;
   1764 }
   1765 
   1766 
   1767 /**
   1768  * Generate abs(a)
   1769  */
   1770 LLVMValueRef
   1771 lp_build_abs(struct lp_build_context *bld,
   1772              LLVMValueRef a)
   1773 {
   1774    LLVMBuilderRef builder = bld->gallivm->builder;
   1775    const struct lp_type type = bld->type;
   1776    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1777 
   1778    assert(lp_check_value(type, a));
   1779 
   1780    if(!type.sign)
   1781       return a;
   1782 
   1783    if(type.floating) {
   1784       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
   1785          /* Workaround llvm.org/PR27332 */
   1786          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   1787          unsigned long long absMask = ~(1ULL << (type.width - 1));
   1788          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
   1789          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1790          a = LLVMBuildAnd(builder, a, mask, "");
   1791          a = LLVMBuildBitCast(builder, a, vec_type, "");
   1792          return a;
   1793       } else {
   1794          char intrinsic[32];
   1795          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
   1796          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   1797       }
   1798    }
   1799 
   1800    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
   1801       switch(type.width) {
   1802       case 8:
   1803          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
   1804       case 16:
   1805          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
   1806       case 32:
   1807          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
   1808       }
   1809    }
   1810    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
   1811       switch(type.width) {
   1812       case 8:
   1813          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
   1814       case 16:
   1815          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
   1816       case 32:
   1817          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
   1818       }
   1819    }
   1820    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
   1821             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
   1822             (type.width == 8 || type.width == 16 || type.width == 32)) {
   1823       debug_printf("%s: inefficient code, should split vectors manually\n",
   1824                    __FUNCTION__);
   1825    }
   1826 
   1827    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
   1828 }
   1829 
   1830 
   1831 LLVMValueRef
   1832 lp_build_negate(struct lp_build_context *bld,
   1833                 LLVMValueRef a)
   1834 {
   1835    LLVMBuilderRef builder = bld->gallivm->builder;
   1836 
   1837    assert(lp_check_value(bld->type, a));
   1838 
   1839    if (bld->type.floating)
   1840       a = LLVMBuildFNeg(builder, a, "");
   1841    else
   1842       a = LLVMBuildNeg(builder, a, "");
   1843 
   1844    return a;
   1845 }
   1846 
   1847 
   1848 /** Return -1, 0 or +1 depending on the sign of a */
   1849 LLVMValueRef
   1850 lp_build_sgn(struct lp_build_context *bld,
   1851              LLVMValueRef a)
   1852 {
   1853    LLVMBuilderRef builder = bld->gallivm->builder;
   1854    const struct lp_type type = bld->type;
   1855    LLVMValueRef cond;
   1856    LLVMValueRef res;
   1857 
   1858    assert(lp_check_value(type, a));
   1859 
   1860    /* Handle non-zero case */
   1861    if(!type.sign) {
   1862       /* if not zero then sign must be positive */
   1863       res = bld->one;
   1864    }
   1865    else if(type.floating) {
   1866       LLVMTypeRef vec_type;
   1867       LLVMTypeRef int_type;
   1868       LLVMValueRef mask;
   1869       LLVMValueRef sign;
   1870       LLVMValueRef one;
   1871       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
   1872 
   1873       int_type = lp_build_int_vec_type(bld->gallivm, type);
   1874       vec_type = lp_build_vec_type(bld->gallivm, type);
   1875       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
   1876 
   1877       /* Take the sign bit and add it to 1 constant */
   1878       sign = LLVMBuildBitCast(builder, a, int_type, "");
   1879       sign = LLVMBuildAnd(builder, sign, mask, "");
   1880       one = LLVMConstBitCast(bld->one, int_type);
   1881       res = LLVMBuildOr(builder, sign, one, "");
   1882       res = LLVMBuildBitCast(builder, res, vec_type, "");
   1883    }
   1884    else
   1885    {
   1886       /* signed int/norm/fixed point */
   1887       /* could use psign with sse3 and appropriate vectors here */
   1888       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
   1889       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
   1890       res = lp_build_select(bld, cond, bld->one, minus_one);
   1891    }
   1892 
   1893    /* Handle zero */
   1894    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   1895    res = lp_build_select(bld, cond, bld->zero, res);
   1896 
   1897    return res;
   1898 }
   1899 
   1900 
   1901 /**
   1902  * Set the sign of float vector 'a' according to 'sign'.
   1903  * If sign==0, return abs(a).
   1904  * If sign==1, return -abs(a);
   1905  * Other values for sign produce undefined results.
   1906  */
   1907 LLVMValueRef
   1908 lp_build_set_sign(struct lp_build_context *bld,
   1909                   LLVMValueRef a, LLVMValueRef sign)
   1910 {
   1911    LLVMBuilderRef builder = bld->gallivm->builder;
   1912    const struct lp_type type = bld->type;
   1913    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   1914    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1915    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   1916    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   1917                              ~((unsigned long long) 1 << (type.width - 1)));
   1918    LLVMValueRef val, res;
   1919 
   1920    assert(type.floating);
   1921    assert(lp_check_value(type, a));
   1922 
   1923    /* val = reinterpret_cast<int>(a) */
   1924    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1925    /* val = val & mask */
   1926    val = LLVMBuildAnd(builder, val, mask, "");
   1927    /* sign = sign << shift */
   1928    sign = LLVMBuildShl(builder, sign, shift, "");
   1929    /* res = val | sign */
   1930    res = LLVMBuildOr(builder, val, sign, "");
   1931    /* res = reinterpret_cast<float>(res) */
   1932    res = LLVMBuildBitCast(builder, res, vec_type, "");
   1933 
   1934    return res;
   1935 }
   1936 
   1937 
   1938 /**
   1939  * Convert vector of (or scalar) int to vector of (or scalar) float.
   1940  */
   1941 LLVMValueRef
   1942 lp_build_int_to_float(struct lp_build_context *bld,
   1943                       LLVMValueRef a)
   1944 {
   1945    LLVMBuilderRef builder = bld->gallivm->builder;
   1946    const struct lp_type type = bld->type;
   1947    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1948 
   1949    assert(type.floating);
   1950 
   1951    return LLVMBuildSIToFP(builder, a, vec_type, "");
   1952 }
   1953 
   1954 static boolean
   1955 arch_rounding_available(const struct lp_type type)
   1956 {
   1957    if ((util_cpu_caps.has_sse4_1 &&
   1958        (type.length == 1 || type.width*type.length == 128)) ||
   1959        (util_cpu_caps.has_avx && type.width*type.length == 256))
   1960       return TRUE;
   1961    else if ((util_cpu_caps.has_altivec &&
   1962             (type.width == 32 && type.length == 4)))
   1963       return TRUE;
   1964 
   1965    return FALSE;
   1966 }
   1967 
   1968 enum lp_build_round_mode
   1969 {
   1970    LP_BUILD_ROUND_NEAREST = 0,
   1971    LP_BUILD_ROUND_FLOOR = 1,
   1972    LP_BUILD_ROUND_CEIL = 2,
   1973    LP_BUILD_ROUND_TRUNCATE = 3
   1974 };
   1975 
   1976 static inline LLVMValueRef
   1977 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
   1978                              LLVMValueRef a)
   1979 {
   1980    LLVMBuilderRef builder = bld->gallivm->builder;
   1981    const struct lp_type type = bld->type;
   1982    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   1983    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   1984    const char *intrinsic;
   1985    LLVMValueRef res;
   1986 
   1987    assert(type.floating);
   1988    /* using the double precision conversions is a bit more complicated */
   1989    assert(type.width == 32);
   1990 
   1991    assert(lp_check_value(type, a));
   1992    assert(util_cpu_caps.has_sse2);
   1993 
   1994    /* This is relying on MXCSR rounding mode, which should always be nearest. */
   1995    if (type.length == 1) {
   1996       LLVMTypeRef vec_type;
   1997       LLVMValueRef undef;
   1998       LLVMValueRef arg;
   1999       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   2000 
   2001       vec_type = LLVMVectorType(bld->elem_type, 4);
   2002 
   2003       intrinsic = "llvm.x86.sse.cvtss2si";
   2004 
   2005       undef = LLVMGetUndef(vec_type);
   2006 
   2007       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
   2008 
   2009       res = lp_build_intrinsic_unary(builder, intrinsic,
   2010                                      ret_type, arg);
   2011    }
   2012    else {
   2013       if (type.width* type.length == 128) {
   2014          intrinsic = "llvm.x86.sse2.cvtps2dq";
   2015       }
   2016       else {
   2017          assert(type.width*type.length == 256);
   2018          assert(util_cpu_caps.has_avx);
   2019 
   2020          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
   2021       }
   2022       res = lp_build_intrinsic_unary(builder, intrinsic,
   2023                                      ret_type, a);
   2024    }
   2025 
   2026    return res;
   2027 }
   2028 
   2029 
   2030 /*
   2031  */
   2032 static inline LLVMValueRef
   2033 lp_build_round_altivec(struct lp_build_context *bld,
   2034                        LLVMValueRef a,
   2035                        enum lp_build_round_mode mode)
   2036 {
   2037    LLVMBuilderRef builder = bld->gallivm->builder;
   2038    const struct lp_type type = bld->type;
   2039    const char *intrinsic = NULL;
   2040 
   2041    assert(type.floating);
   2042 
   2043    assert(lp_check_value(type, a));
   2044    assert(util_cpu_caps.has_altivec);
   2045 
   2046    (void)type;
   2047 
   2048    switch (mode) {
   2049    case LP_BUILD_ROUND_NEAREST:
   2050       intrinsic = "llvm.ppc.altivec.vrfin";
   2051       break;
   2052    case LP_BUILD_ROUND_FLOOR:
   2053       intrinsic = "llvm.ppc.altivec.vrfim";
   2054       break;
   2055    case LP_BUILD_ROUND_CEIL:
   2056       intrinsic = "llvm.ppc.altivec.vrfip";
   2057       break;
   2058    case LP_BUILD_ROUND_TRUNCATE:
   2059       intrinsic = "llvm.ppc.altivec.vrfiz";
   2060       break;
   2061    }
   2062 
   2063    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2064 }
   2065 
   2066 static inline LLVMValueRef
   2067 lp_build_round_arch(struct lp_build_context *bld,
   2068                     LLVMValueRef a,
   2069                     enum lp_build_round_mode mode)
   2070 {
   2071    if (util_cpu_caps.has_sse4_1) {
   2072       LLVMBuilderRef builder = bld->gallivm->builder;
   2073       const struct lp_type type = bld->type;
   2074       const char *intrinsic_root;
   2075       char intrinsic[32];
   2076 
   2077       assert(type.floating);
   2078       assert(lp_check_value(type, a));
   2079       (void)type;
   2080 
   2081       switch (mode) {
   2082       case LP_BUILD_ROUND_NEAREST:
   2083          intrinsic_root = "llvm.nearbyint";
   2084          break;
   2085       case LP_BUILD_ROUND_FLOOR:
   2086          intrinsic_root = "llvm.floor";
   2087          break;
   2088       case LP_BUILD_ROUND_CEIL:
   2089          intrinsic_root = "llvm.ceil";
   2090          break;
   2091       case LP_BUILD_ROUND_TRUNCATE:
   2092          intrinsic_root = "llvm.trunc";
   2093          break;
   2094       }
   2095 
   2096       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
   2097       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2098    }
   2099    else /* (util_cpu_caps.has_altivec) */
   2100      return lp_build_round_altivec(bld, a, mode);
   2101 }
   2102 
   2103 /**
   2104  * Return the integer part of a float (vector) value (== round toward zero).
   2105  * The returned value is a float (vector).
   2106  * Ex: trunc(-1.5) = -1.0
   2107  */
   2108 LLVMValueRef
   2109 lp_build_trunc(struct lp_build_context *bld,
   2110                LLVMValueRef a)
   2111 {
   2112    LLVMBuilderRef builder = bld->gallivm->builder;
   2113    const struct lp_type type = bld->type;
   2114 
   2115    assert(type.floating);
   2116    assert(lp_check_value(type, a));
   2117 
   2118    if (arch_rounding_available(type)) {
   2119       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   2120    }
   2121    else {
   2122       const struct lp_type type = bld->type;
   2123       struct lp_type inttype;
   2124       struct lp_build_context intbld;
   2125       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2126       LLVMValueRef trunc, res, anosign, mask;
   2127       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2128       LLVMTypeRef vec_type = bld->vec_type;
   2129 
   2130       assert(type.width == 32); /* might want to handle doubles at some point */
   2131 
   2132       inttype = type;
   2133       inttype.floating = 0;
   2134       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2135 
   2136       /* round by truncation */
   2137       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2138       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
   2139 
   2140       /* mask out sign bit */
   2141       anosign = lp_build_abs(bld, a);
   2142       /*
   2143        * mask out all values if anosign > 2^24
   2144        * This should work both for large ints (all rounding is no-op for them
   2145        * because such floats are always exact) as well as special cases like
   2146        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2147        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
   2148        */
   2149       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2150       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2151       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2152       return lp_build_select(bld, mask, a, res);
   2153    }
   2154 }
   2155 
   2156 
   2157 /**
   2158  * Return float (vector) rounded to nearest integer (vector).  The returned
   2159  * value is a float (vector).
   2160  * Ex: round(0.9) = 1.0
   2161  * Ex: round(-1.5) = -2.0
   2162  */
   2163 LLVMValueRef
   2164 lp_build_round(struct lp_build_context *bld,
   2165                LLVMValueRef a)
   2166 {
   2167    LLVMBuilderRef builder = bld->gallivm->builder;
   2168    const struct lp_type type = bld->type;
   2169 
   2170    assert(type.floating);
   2171    assert(lp_check_value(type, a));
   2172 
   2173    if (arch_rounding_available(type)) {
   2174       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   2175    }
   2176    else {
   2177       const struct lp_type type = bld->type;
   2178       struct lp_type inttype;
   2179       struct lp_build_context intbld;
   2180       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2181       LLVMValueRef res, anosign, mask;
   2182       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2183       LLVMTypeRef vec_type = bld->vec_type;
   2184 
   2185       assert(type.width == 32); /* might want to handle doubles at some point */
   2186 
   2187       inttype = type;
   2188       inttype.floating = 0;
   2189       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2190 
   2191       res = lp_build_iround(bld, a);
   2192       res = LLVMBuildSIToFP(builder, res, vec_type, "");
   2193 
   2194       /* mask out sign bit */
   2195       anosign = lp_build_abs(bld, a);
   2196       /*
   2197        * mask out all values if anosign > 2^24
   2198        * This should work both for large ints (all rounding is no-op for them
   2199        * because such floats are always exact) as well as special cases like
   2200        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2201        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
   2202        */
   2203       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2204       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2205       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2206       return lp_build_select(bld, mask, a, res);
   2207    }
   2208 }
   2209 
   2210 
   2211 /**
   2212  * Return floor of float (vector), result is a float (vector)
   2213  * Ex: floor(1.1) = 1.0
   2214  * Ex: floor(-1.1) = -2.0
   2215  */
   2216 LLVMValueRef
   2217 lp_build_floor(struct lp_build_context *bld,
   2218                LLVMValueRef a)
   2219 {
   2220    LLVMBuilderRef builder = bld->gallivm->builder;
   2221    const struct lp_type type = bld->type;
   2222 
   2223    assert(type.floating);
   2224    assert(lp_check_value(type, a));
   2225 
   2226    if (arch_rounding_available(type)) {
   2227       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   2228    }
   2229    else {
   2230       const struct lp_type type = bld->type;
   2231       struct lp_type inttype;
   2232       struct lp_build_context intbld;
   2233       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2234       LLVMValueRef trunc, res, anosign, mask;
   2235       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2236       LLVMTypeRef vec_type = bld->vec_type;
   2237 
   2238       if (type.width != 32) {
   2239          char intrinsic[32];
   2240          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
   2241          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2242       }
   2243 
   2244       assert(type.width == 32); /* might want to handle doubles at some point */
   2245 
   2246       inttype = type;
   2247       inttype.floating = 0;
   2248       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2249 
   2250       /* round by truncation */
   2251       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2252       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
   2253 
   2254       if (type.sign) {
   2255          LLVMValueRef tmp;
   2256 
   2257          /*
   2258           * fix values if rounding is wrong (for non-special cases)
   2259           * - this is the case if trunc > a
   2260           */
   2261          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
   2262          /* tmp = trunc > a ? 1.0 : 0.0 */
   2263          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
   2264          tmp = lp_build_and(&intbld, mask, tmp);
   2265          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
   2266          res = lp_build_sub(bld, res, tmp);
   2267       }
   2268 
   2269       /* mask out sign bit */
   2270       anosign = lp_build_abs(bld, a);
   2271       /*
   2272        * mask out all values if anosign > 2^24
   2273        * This should work both for large ints (all rounding is no-op for them
   2274        * because such floats are always exact) as well as special cases like
   2275        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2276        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
   2277        */
   2278       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2279       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2280       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2281       return lp_build_select(bld, mask, a, res);
   2282    }
   2283 }
   2284 
   2285 
   2286 /**
   2287  * Return ceiling of float (vector), returning float (vector).
   2288  * Ex: ceil( 1.1) = 2.0
   2289  * Ex: ceil(-1.1) = -1.0
   2290  */
   2291 LLVMValueRef
   2292 lp_build_ceil(struct lp_build_context *bld,
   2293               LLVMValueRef a)
   2294 {
   2295    LLVMBuilderRef builder = bld->gallivm->builder;
   2296    const struct lp_type type = bld->type;
   2297 
   2298    assert(type.floating);
   2299    assert(lp_check_value(type, a));
   2300 
   2301    if (arch_rounding_available(type)) {
   2302       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   2303    }
   2304    else {
   2305       const struct lp_type type = bld->type;
   2306       struct lp_type inttype;
   2307       struct lp_build_context intbld;
   2308       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2309       LLVMValueRef trunc, res, anosign, mask, tmp;
   2310       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2311       LLVMTypeRef vec_type = bld->vec_type;
   2312 
   2313       if (type.width != 32) {
   2314          char intrinsic[32];
   2315          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
   2316          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2317       }
   2318 
   2319       assert(type.width == 32); /* might want to handle doubles at some point */
   2320 
   2321       inttype = type;
   2322       inttype.floating = 0;
   2323       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2324 
   2325       /* round by truncation */
   2326       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2327       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
   2328 
   2329       /*
   2330        * fix values if rounding is wrong (for non-special cases)
   2331        * - this is the case if trunc < a
   2332        */
   2333       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
   2334       /* tmp = trunc < a ? 1.0 : 0.0 */
   2335       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
   2336       tmp = lp_build_and(&intbld, mask, tmp);
   2337       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
   2338       res = lp_build_add(bld, trunc, tmp);
   2339 
   2340       /* mask out sign bit */
   2341       anosign = lp_build_abs(bld, a);
   2342       /*
   2343        * mask out all values if anosign > 2^24
   2344        * This should work both for large ints (all rounding is no-op for them
   2345        * because such floats are always exact) as well as special cases like
   2346        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2347        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
   2348        */
   2349       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2350       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2351       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2352       return lp_build_select(bld, mask, a, res);
   2353    }
   2354 }
   2355 
   2356 
   2357 /**
   2358  * Return fractional part of 'a' computed as a - floor(a)
   2359  * Typically used in texture coord arithmetic.
   2360  */
   2361 LLVMValueRef
   2362 lp_build_fract(struct lp_build_context *bld,
   2363                LLVMValueRef a)
   2364 {
   2365    assert(bld->type.floating);
   2366    return lp_build_sub(bld, a, lp_build_floor(bld, a));
   2367 }
   2368 
   2369 
   2370 /**
   2371  * Prevent returning 1.0 for very small negative values of 'a' by clamping
   2372  * against 0.99999(9). (Will also return that value for NaNs.)
   2373  */
   2374 static inline LLVMValueRef
   2375 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
   2376 {
   2377    LLVMValueRef max;
   2378 
   2379    /* this is the largest number smaller than 1.0 representable as float */
   2380    max = lp_build_const_vec(bld->gallivm, bld->type,
   2381                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   2382    return lp_build_min_ext(bld, fract, max,
   2383                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   2384 }
   2385 
   2386 
   2387 /**
   2388  * Same as lp_build_fract, but guarantees that the result is always smaller
   2389  * than one. Will also return the smaller-than-one value for infs, NaNs.
   2390  */
   2391 LLVMValueRef
   2392 lp_build_fract_safe(struct lp_build_context *bld,
   2393                     LLVMValueRef a)
   2394 {
   2395    return clamp_fract(bld, lp_build_fract(bld, a));
   2396 }
   2397 
   2398 
   2399 /**
   2400  * Return the integer part of a float (vector) value (== round toward zero).
   2401  * The returned value is an integer (vector).
   2402  * Ex: itrunc(-1.5) = -1
   2403  */
   2404 LLVMValueRef
   2405 lp_build_itrunc(struct lp_build_context *bld,
   2406                 LLVMValueRef a)
   2407 {
   2408    LLVMBuilderRef builder = bld->gallivm->builder;
   2409    const struct lp_type type = bld->type;
   2410    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   2411 
   2412    assert(type.floating);
   2413    assert(lp_check_value(type, a));
   2414 
   2415    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2416 }
   2417 
   2418 
   2419 /**
   2420  * Return float (vector) rounded to nearest integer (vector).  The returned
   2421  * value is an integer (vector).
   2422  * Ex: iround(0.9) = 1
   2423  * Ex: iround(-1.5) = -2
   2424  */
   2425 LLVMValueRef
   2426 lp_build_iround(struct lp_build_context *bld,
   2427                 LLVMValueRef a)
   2428 {
   2429    LLVMBuilderRef builder = bld->gallivm->builder;
   2430    const struct lp_type type = bld->type;
   2431    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2432    LLVMValueRef res;
   2433 
   2434    assert(type.floating);
   2435 
   2436    assert(lp_check_value(type, a));
   2437 
   2438    if ((util_cpu_caps.has_sse2 &&
   2439        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
   2440        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   2441       return lp_build_iround_nearest_sse2(bld, a);
   2442    }
   2443    if (arch_rounding_available(type)) {
   2444       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   2445    }
   2446    else {
   2447       LLVMValueRef half;
   2448 
   2449       half = lp_build_const_vec(bld->gallivm, type, 0.5);
   2450 
   2451       if (type.sign) {
   2452          LLVMTypeRef vec_type = bld->vec_type;
   2453          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   2454                                     (unsigned long long)1 << (type.width - 1));
   2455          LLVMValueRef sign;
   2456 
   2457          /* get sign bit */
   2458          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
   2459          sign = LLVMBuildAnd(builder, sign, mask, "");
   2460 
   2461          /* sign * 0.5 */
   2462          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
   2463          half = LLVMBuildOr(builder, sign, half, "");
   2464          half = LLVMBuildBitCast(builder, half, vec_type, "");
   2465       }
   2466 
   2467       res = LLVMBuildFAdd(builder, a, half, "");
   2468    }
   2469 
   2470    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   2471 
   2472    return res;
   2473 }
   2474 
   2475 
   2476 /**
   2477  * Return floor of float (vector), result is an int (vector)
   2478  * Ex: ifloor(1.1) = 1.0
   2479  * Ex: ifloor(-1.1) = -2.0
   2480  */
   2481 LLVMValueRef
   2482 lp_build_ifloor(struct lp_build_context *bld,
   2483                 LLVMValueRef a)
   2484 {
   2485    LLVMBuilderRef builder = bld->gallivm->builder;
   2486    const struct lp_type type = bld->type;
   2487    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2488    LLVMValueRef res;
   2489 
   2490    assert(type.floating);
   2491    assert(lp_check_value(type, a));
   2492 
   2493    res = a;
   2494    if (type.sign) {
   2495       if (arch_rounding_available(type)) {
   2496          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   2497       }
   2498       else {
   2499          struct lp_type inttype;
   2500          struct lp_build_context intbld;
   2501          LLVMValueRef trunc, itrunc, mask;
   2502 
   2503          assert(type.floating);
   2504          assert(lp_check_value(type, a));
   2505 
   2506          inttype = type;
   2507          inttype.floating = 0;
   2508          lp_build_context_init(&intbld, bld->gallivm, inttype);
   2509 
   2510          /* round by truncation */
   2511          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2512          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
   2513 
   2514          /*
   2515           * fix values if rounding is wrong (for non-special cases)
   2516           * - this is the case if trunc > a
   2517           * The results of doing this with NaNs, very large values etc.
   2518           * are undefined but this seems to be the case anyway.
   2519           */
   2520          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
   2521          /* cheapie minus one with mask since the mask is minus one / zero */
   2522          return lp_build_add(&intbld, itrunc, mask);
   2523       }
   2524    }
   2525 
   2526    /* round to nearest (toward zero) */
   2527    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
   2528 
   2529    return res;
   2530 }
   2531 
   2532 
   2533 /**
   2534  * Return ceiling of float (vector), returning int (vector).
   2535  * Ex: iceil( 1.1) = 2
   2536  * Ex: iceil(-1.1) = -1
   2537  */
   2538 LLVMValueRef
   2539 lp_build_iceil(struct lp_build_context *bld,
   2540                LLVMValueRef a)
   2541 {
   2542    LLVMBuilderRef builder = bld->gallivm->builder;
   2543    const struct lp_type type = bld->type;
   2544    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2545    LLVMValueRef res;
   2546 
   2547    assert(type.floating);
   2548    assert(lp_check_value(type, a));
   2549 
   2550    if (arch_rounding_available(type)) {
   2551       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   2552    }
   2553    else {
   2554       struct lp_type inttype;
   2555       struct lp_build_context intbld;
   2556       LLVMValueRef trunc, itrunc, mask;
   2557 
   2558       assert(type.floating);
   2559       assert(lp_check_value(type, a));
   2560 
   2561       inttype = type;
   2562       inttype.floating = 0;
   2563       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2564 
   2565       /* round by truncation */
   2566       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2567       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
   2568 
   2569       /*
   2570        * fix values if rounding is wrong (for non-special cases)
   2571        * - this is the case if trunc < a
   2572        * The results of doing this with NaNs, very large values etc.
   2573        * are undefined but this seems to be the case anyway.
   2574        */
   2575       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
   2576       /* cheapie plus one with mask since the mask is minus one / zero */
   2577       return lp_build_sub(&intbld, itrunc, mask);
   2578    }
   2579 
   2580    /* round to nearest (toward zero) */
   2581    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
   2582 
   2583    return res;
   2584 }
   2585 
   2586 
   2587 /**
   2588  * Combined ifloor() & fract().
   2589  *
   2590  * Preferred to calling the functions separately, as it will ensure that the
   2591  * strategy (floor() vs ifloor()) that results in less redundant work is used.
   2592  */
   2593 void
   2594 lp_build_ifloor_fract(struct lp_build_context *bld,
   2595                       LLVMValueRef a,
   2596                       LLVMValueRef *out_ipart,
   2597                       LLVMValueRef *out_fpart)
   2598 {
   2599    LLVMBuilderRef builder = bld->gallivm->builder;
   2600    const struct lp_type type = bld->type;
   2601    LLVMValueRef ipart;
   2602 
   2603    assert(type.floating);
   2604    assert(lp_check_value(type, a));
   2605 
   2606    if (arch_rounding_available(type)) {
   2607       /*
   2608        * floor() is easier.
   2609        */
   2610 
   2611       ipart = lp_build_floor(bld, a);
   2612       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   2613       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   2614    }
   2615    else {
   2616       /*
   2617        * ifloor() is easier.
   2618        */
   2619 
   2620       *out_ipart = lp_build_ifloor(bld, a);
   2621       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
   2622       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   2623    }
   2624 }
   2625 
   2626 
   2627 /**
   2628  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
   2629  * always smaller than one.
   2630  */
   2631 void
   2632 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
   2633                            LLVMValueRef a,
   2634                            LLVMValueRef *out_ipart,
   2635                            LLVMValueRef *out_fpart)
   2636 {
   2637    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   2638    *out_fpart = clamp_fract(bld, *out_fpart);
   2639 }
   2640 
   2641 
   2642 LLVMValueRef
   2643 lp_build_sqrt(struct lp_build_context *bld,
   2644               LLVMValueRef a)
   2645 {
   2646    LLVMBuilderRef builder = bld->gallivm->builder;
   2647    const struct lp_type type = bld->type;
   2648    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   2649    char intrinsic[32];
   2650 
   2651    assert(lp_check_value(type, a));
   2652 
   2653    assert(type.floating);
   2654    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
   2655 
   2656    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2657 }
   2658 
   2659 
   2660 /**
   2661  * Do one Newton-Raphson step to improve reciprocate precision:
   2662  *
   2663  *   x_{i+1} = x_i * (2 - a * x_i)
   2664  *
   2665  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
   2666  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
   2667  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
   2668  * halo. It would be necessary to clamp the argument to prevent this.
   2669  *
   2670  * See also:
   2671  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
   2672  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
   2673  */
   2674 static inline LLVMValueRef
   2675 lp_build_rcp_refine(struct lp_build_context *bld,
   2676                     LLVMValueRef a,
   2677                     LLVMValueRef rcp_a)
   2678 {
   2679    LLVMBuilderRef builder = bld->gallivm->builder;
   2680    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   2681    LLVMValueRef res;
   2682 
   2683    res = LLVMBuildFMul(builder, a, rcp_a, "");
   2684    res = LLVMBuildFSub(builder, two, res, "");
   2685    res = LLVMBuildFMul(builder, rcp_a, res, "");
   2686 
   2687    return res;
   2688 }
   2689 
   2690 
   2691 LLVMValueRef
   2692 lp_build_rcp(struct lp_build_context *bld,
   2693              LLVMValueRef a)
   2694 {
   2695    LLVMBuilderRef builder = bld->gallivm->builder;
   2696    const struct lp_type type = bld->type;
   2697 
   2698    assert(lp_check_value(type, a));
   2699 
   2700    if(a == bld->zero)
   2701       return bld->undef;
   2702    if(a == bld->one)
   2703       return bld->one;
   2704    if(a == bld->undef)
   2705       return bld->undef;
   2706 
   2707    assert(type.floating);
   2708 
   2709    if(LLVMIsConstant(a))
   2710       return LLVMConstFDiv(bld->one, a);
   2711 
   2712    /*
   2713     * We don't use RCPPS because:
   2714     * - it only has 10bits of precision
   2715     * - it doesn't even get the reciprocate of 1.0 exactly
   2716     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
   2717     * - for recent processors the benefit over DIVPS is marginal, a case
   2718     *   dependent
   2719     *
   2720     * We could still use it on certain processors if benchmarks show that the
   2721     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
   2722     * particular uses that require less workarounds.
   2723     */
   2724 
   2725    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   2726          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
   2727       const unsigned num_iterations = 0;
   2728       LLVMValueRef res;
   2729       unsigned i;
   2730       const char *intrinsic = NULL;
   2731 
   2732       if (type.length == 4) {
   2733          intrinsic = "llvm.x86.sse.rcp.ps";
   2734       }
   2735       else {
   2736          intrinsic = "llvm.x86.avx.rcp.ps.256";
   2737       }
   2738 
   2739       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2740 
   2741       for (i = 0; i < num_iterations; ++i) {
   2742          res = lp_build_rcp_refine(bld, a, res);
   2743       }
   2744 
   2745       return res;
   2746    }
   2747 
   2748    return LLVMBuildFDiv(builder, bld->one, a, "");
   2749 }
   2750 
   2751 
   2752 /**
   2753  * Do one Newton-Raphson step to improve rsqrt precision:
   2754  *
   2755  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
   2756  *
   2757  * See also Intel 64 and IA-32 Architectures Optimization Manual.
   2758  */
   2759 static inline LLVMValueRef
   2760 lp_build_rsqrt_refine(struct lp_build_context *bld,
   2761                       LLVMValueRef a,
   2762                       LLVMValueRef rsqrt_a)
   2763 {
   2764    LLVMBuilderRef builder = bld->gallivm->builder;
   2765    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   2766    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   2767    LLVMValueRef res;
   2768 
   2769    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   2770    res = LLVMBuildFMul(builder, a, res, "");
   2771    res = LLVMBuildFSub(builder, three, res, "");
   2772    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   2773    res = LLVMBuildFMul(builder, half, res, "");
   2774 
   2775    return res;
   2776 }
   2777 
   2778 
   2779 /**
   2780  * Generate 1/sqrt(a).
   2781  * Result is undefined for values < 0, infinity for +0.
   2782  */
   2783 LLVMValueRef
   2784 lp_build_rsqrt(struct lp_build_context *bld,
   2785                LLVMValueRef a)
   2786 {
   2787    const struct lp_type type = bld->type;
   2788 
   2789    assert(lp_check_value(type, a));
   2790 
   2791    assert(type.floating);
   2792 
   2793    /*
   2794     * This should be faster but all denormals will end up as infinity.
   2795     */
   2796    if (0 && lp_build_fast_rsqrt_available(type)) {
   2797       const unsigned num_iterations = 1;
   2798       LLVMValueRef res;
   2799       unsigned i;
   2800 
   2801       /* rsqrt(1.0) != 1.0 here */
   2802       res = lp_build_fast_rsqrt(bld, a);
   2803 
   2804       if (num_iterations) {
   2805          /*
   2806           * Newton-Raphson will result in NaN instead of infinity for zero,
   2807           * and NaN instead of zero for infinity.
   2808           * Also, need to ensure rsqrt(1.0) == 1.0.
   2809           * All numbers smaller than FLT_MIN will result in +infinity
   2810           * (rsqrtps treats all denormals as zero).
   2811           */
   2812          LLVMValueRef cmp;
   2813          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
   2814          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
   2815 
   2816          for (i = 0; i < num_iterations; ++i) {
   2817             res = lp_build_rsqrt_refine(bld, a, res);
   2818          }
   2819          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
   2820          res = lp_build_select(bld, cmp, inf, res);
   2821          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
   2822          res = lp_build_select(bld, cmp, bld->zero, res);
   2823          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
   2824          res = lp_build_select(bld, cmp, bld->one, res);
   2825       }
   2826 
   2827       return res;
   2828    }
   2829 
   2830    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   2831 }
   2832 
   2833 /**
   2834  * If there's a fast (inaccurate) rsqrt instruction available
   2835  * (caller may want to avoid to call rsqrt_fast if it's not available,
   2836  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
   2837  * unavailable it would result in sqrt/div/mul so obviously
   2838  * much better to just call sqrt, skipping both div and mul).
   2839  */
   2840 boolean
   2841 lp_build_fast_rsqrt_available(struct lp_type type)
   2842 {
   2843    assert(type.floating);
   2844 
   2845    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   2846        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   2847       return true;
   2848    }
   2849    return false;
   2850 }
   2851 
   2852 
   2853 /**
   2854  * Generate 1/sqrt(a).
   2855  * Result is undefined for values < 0, infinity for +0.
   2856  * Precision is limited, only ~10 bits guaranteed
   2857  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
   2858  */
   2859 LLVMValueRef
   2860 lp_build_fast_rsqrt(struct lp_build_context *bld,
   2861                     LLVMValueRef a)
   2862 {
   2863    LLVMBuilderRef builder = bld->gallivm->builder;
   2864    const struct lp_type type = bld->type;
   2865 
   2866    assert(lp_check_value(type, a));
   2867 
   2868    if (lp_build_fast_rsqrt_available(type)) {
   2869       const char *intrinsic = NULL;
   2870 
   2871       if (type.length == 4) {
   2872          intrinsic = "llvm.x86.sse.rsqrt.ps";
   2873       }
   2874       else {
   2875          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
   2876       }
   2877       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2878    }
   2879    else {
   2880       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   2881    }
   2882    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   2883 }
   2884 
   2885 
   2886 /**
   2887  * Generate sin(a) or cos(a) using polynomial approximation.
   2888  * TODO: it might be worth recognizing sin and cos using same source
   2889  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
   2890  * would be way cheaper than calculating (nearly) everything twice...
   2891  * Not sure it's common enough to be worth bothering however, scs
   2892  * opcode could also benefit from calculating both though.
   2893  */
   2894 static LLVMValueRef
   2895 lp_build_sin_or_cos(struct lp_build_context *bld,
   2896                     LLVMValueRef a,
   2897                     boolean cos)
   2898 {
   2899    struct gallivm_state *gallivm = bld->gallivm;
   2900    LLVMBuilderRef b = gallivm->builder;
   2901    struct lp_type int_type = lp_int_type(bld->type);
   2902 
   2903    /*
   2904     *  take the absolute value,
   2905     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   2906     */
   2907 
   2908    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   2909    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
   2910 
   2911    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   2912    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
   2913 
   2914    /*
   2915     * scale by 4/Pi
   2916     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
   2917     */
   2918 
   2919    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   2920    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
   2921 
   2922    /*
   2923     * store the integer part of y in mm0
   2924     * emm2 = _mm_cvttps_epi32(y);
   2925     */
   2926 
   2927    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
   2928 
   2929    /*
   2930     * j=(j+1) & (~1) (see the cephes sources)
   2931     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   2932     */
   2933 
   2934    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   2935    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   2936    /*
   2937     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   2938     */
   2939    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   2940    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
   2941 
   2942    /*
   2943     * y = _mm_cvtepi32_ps(emm2);
   2944     */
   2945    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
   2946 
   2947    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   2948    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   2949    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   2950    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   2951 
   2952    /*
   2953     * Argument used for poly selection and sign bit determination
   2954     * is different for sin vs. cos.
   2955     */
   2956    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
   2957                                emm2_and;
   2958 
   2959    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
   2960                                                               LLVMBuildNot(b, emm2_2, ""), ""),
   2961                                               const_29, "sign_bit") :
   2962                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
   2963                                                               LLVMBuildShl(b, emm2_add,
   2964                                                                            const_29, ""), ""),
   2965                                               sign_mask, "sign_bit");
   2966 
   2967    /*
   2968     * get the polynom selection mask
   2969     * there is one polynom for 0 <= x <= Pi/4
   2970     * and another one for Pi/4<x<=Pi/2
   2971     * Both branches will be computed.
   2972     *
   2973     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   2974     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   2975     */
   2976 
   2977    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   2978    LLVMValueRef poly_mask = lp_build_compare(gallivm,
   2979                                              int_type, PIPE_FUNC_EQUAL,
   2980                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   2981 
   2982    /*
   2983     * _PS_CONST(minus_cephes_DP1, -0.78515625);
   2984     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
   2985     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
   2986     */
   2987    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   2988    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   2989    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
   2990 
   2991    /*
   2992     * The magic pass: "Extended precision modular arithmetic"
   2993     * x = ((x - y * DP1) - y * DP2) - y * DP3;
   2994     */
   2995    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   2996    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   2997    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
   2998 
   2999    /*
   3000     * Evaluate the first polynom  (0 <= x <= Pi/4)
   3001     *
   3002     * z = _mm_mul_ps(x,x);
   3003     */
   3004    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
   3005 
   3006    /*
   3007     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
   3008     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
   3009     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
   3010     */
   3011    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   3012    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   3013    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
   3014 
   3015    /*
   3016     * y = *(v4sf*)_ps_coscof_p0;
   3017     * y = _mm_mul_ps(y, z);
   3018     */
   3019    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   3020    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   3021    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   3022    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
   3023 
   3024 
   3025    /*
   3026     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   3027     * y = _mm_sub_ps(y, tmp);
   3028     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
   3029     */
   3030    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   3031    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   3032    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   3033    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   3034    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
   3035 
   3036    /*
   3037     * _PS_CONST(sincof_p0, -1.9515295891E-4);
   3038     * _PS_CONST(sincof_p1,  8.3321608736E-3);
   3039     * _PS_CONST(sincof_p2, -1.6666654611E-1);
   3040     */
   3041    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   3042    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   3043    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
   3044 
   3045    /*
   3046     * Evaluate the second polynom  (Pi/4 <= x <= 0)
   3047     *
   3048     * y2 = *(v4sf*)_ps_sincof_p0;
   3049     * y2 = _mm_mul_ps(y2, z);
   3050     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   3051     * y2 = _mm_mul_ps(y2, z);
   3052     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   3053     * y2 = _mm_mul_ps(y2, z);
   3054     * y2 = _mm_mul_ps(y2, x);
   3055     * y2 = _mm_add_ps(y2, x);
   3056     */
   3057 
   3058    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   3059    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   3060    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   3061    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
   3062 
   3063    /*
   3064     * select the correct result from the two polynoms
   3065     * xmm3 = poly_mask;
   3066     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   3067     * y = _mm_andnot_ps(xmm3, y);
   3068     * y = _mm_or_ps(y,y2);
   3069     */
   3070    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   3071    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   3072    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   3073    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   3074    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   3075    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
   3076 
   3077    /*
   3078     * update the sign
   3079     * y = _mm_xor_ps(y, sign_bit);
   3080     */
   3081    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   3082    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   3083 
   3084    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
   3085 
   3086    /* clamp output to be within [-1, 1] */
   3087    y_result = lp_build_clamp(bld, y_result,
   3088                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
   3089                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   3090    /* If a is -inf, inf or NaN then return NaN */
   3091    y_result = lp_build_select(bld, isfinite, y_result,
   3092                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   3093    return y_result;
   3094 }
   3095 
   3096 
   3097 /**
   3098  * Generate sin(a)
   3099  */
   3100 LLVMValueRef
   3101 lp_build_sin(struct lp_build_context *bld,
   3102              LLVMValueRef a)
   3103 {
   3104    return lp_build_sin_or_cos(bld, a, FALSE);
   3105 }
   3106 
   3107 
   3108 /**
   3109  * Generate cos(a)
   3110  */
   3111 LLVMValueRef
   3112 lp_build_cos(struct lp_build_context *bld,
   3113              LLVMValueRef a)
   3114 {
   3115    return lp_build_sin_or_cos(bld, a, TRUE);
   3116 }
   3117 
   3118 
   3119 /**
   3120  * Generate pow(x, y)
   3121  */
   3122 LLVMValueRef
   3123 lp_build_pow(struct lp_build_context *bld,
   3124              LLVMValueRef x,
   3125              LLVMValueRef y)
   3126 {
   3127    /* TODO: optimize the constant case */
   3128    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   3129        LLVMIsConstant(x) && LLVMIsConstant(y)) {
   3130       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3131                    __FUNCTION__);
   3132    }
   3133 
   3134    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
   3135 }
   3136 
   3137 
   3138 /**
   3139  * Generate exp(x)
   3140  */
   3141 LLVMValueRef
   3142 lp_build_exp(struct lp_build_context *bld,
   3143              LLVMValueRef x)
   3144 {
   3145    /* log2(e) = 1/log(2) */
   3146    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
   3147                                            1.4426950408889634);
   3148 
   3149    assert(lp_check_value(bld->type, x));
   3150 
   3151    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
   3152 }
   3153 
   3154 
   3155 /**
   3156  * Generate log(x)
   3157  * Behavior is undefined with infs, 0s and nans
   3158  */
   3159 LLVMValueRef
   3160 lp_build_log(struct lp_build_context *bld,
   3161              LLVMValueRef x)
   3162 {
   3163    /* log(2) */
   3164    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
   3165                                           0.69314718055994529);
   3166 
   3167    assert(lp_check_value(bld->type, x));
   3168 
   3169    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
   3170 }
   3171 
   3172 /**
   3173  * Generate log(x) that handles edge cases (infs, 0s and nans)
   3174  */
   3175 LLVMValueRef
   3176 lp_build_log_safe(struct lp_build_context *bld,
   3177                   LLVMValueRef x)
   3178 {
   3179    /* log(2) */
   3180    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
   3181                                           0.69314718055994529);
   3182 
   3183    assert(lp_check_value(bld->type, x));
   3184 
   3185    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
   3186 }
   3187 
   3188 
   3189 /**
   3190  * Generate polynomial.
   3191  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
   3192  */
   3193 LLVMValueRef
   3194 lp_build_polynomial(struct lp_build_context *bld,
   3195                     LLVMValueRef x,
   3196                     const double *coeffs,
   3197                     unsigned num_coeffs)
   3198 {
   3199    const struct lp_type type = bld->type;
   3200    LLVMValueRef even = NULL, odd = NULL;
   3201    LLVMValueRef x2;
   3202    unsigned i;
   3203 
   3204    assert(lp_check_value(bld->type, x));
   3205 
   3206    /* TODO: optimize the constant case */
   3207    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   3208        LLVMIsConstant(x)) {
   3209       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3210                    __FUNCTION__);
   3211    }
   3212 
   3213    /*
   3214     * Calculate odd and even terms seperately to decrease data dependency
   3215     * Ex:
   3216     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
   3217     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
   3218     */
   3219    x2 = lp_build_mul(bld, x, x);
   3220 
   3221    for (i = num_coeffs; i--; ) {
   3222       LLVMValueRef coeff;
   3223 
   3224       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
   3225 
   3226       if (i % 2 == 0) {
   3227          if (even)
   3228             even = lp_build_mad(bld, x2, even, coeff);
   3229          else
   3230             even = coeff;
   3231       } else {
   3232          if (odd)
   3233             odd = lp_build_mad(bld, x2, odd, coeff);
   3234          else
   3235             odd = coeff;
   3236       }
   3237    }
   3238 
   3239    if (odd)
   3240       return lp_build_mad(bld, odd, x, even);
   3241    else if (even)
   3242       return even;
   3243    else
   3244       return bld->undef;
   3245 }
   3246 
   3247 
   3248 /**
   3249  * Minimax polynomial fit of 2**x, in range [0, 1[
   3250  */
   3251 const double lp_build_exp2_polynomial[] = {
   3252 #if EXP_POLY_DEGREE == 5
   3253    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   3254    0.693153073200168932794,
   3255    0.240153617044375388211,
   3256    0.0558263180532956664775,
   3257    0.00898934009049466391101,
   3258    0.00187757667519147912699
   3259 #elif EXP_POLY_DEGREE == 4
   3260    1.00000259337069434683,
   3261    0.693003834469974940458,
   3262    0.24144275689150793076,
   3263    0.0520114606103070150235,
   3264    0.0135341679161270268764
   3265 #elif EXP_POLY_DEGREE == 3
   3266    0.999925218562710312959,
   3267    0.695833540494823811697,
   3268    0.226067155427249155588,
   3269    0.0780245226406372992967
   3270 #elif EXP_POLY_DEGREE == 2
   3271    1.00172476321474503578,
   3272    0.657636275736077639316,
   3273    0.33718943461968720704
   3274 #else
   3275 #error
   3276 #endif
   3277 };
   3278 
   3279 
   3280 LLVMValueRef
   3281 lp_build_exp2(struct lp_build_context *bld,
   3282               LLVMValueRef x)
   3283 {
   3284    LLVMBuilderRef builder = bld->gallivm->builder;
   3285    const struct lp_type type = bld->type;
   3286    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   3287    LLVMValueRef ipart = NULL;
   3288    LLVMValueRef fpart = NULL;
   3289    LLVMValueRef expipart = NULL;
   3290    LLVMValueRef expfpart = NULL;
   3291    LLVMValueRef res = NULL;
   3292 
   3293    assert(lp_check_value(bld->type, x));
   3294 
   3295    /* TODO: optimize the constant case */
   3296    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   3297        LLVMIsConstant(x)) {
   3298       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3299                    __FUNCTION__);
   3300    }
   3301 
   3302    assert(type.floating && type.width == 32);
   3303 
   3304    /* We want to preserve NaN and make sure than for exp2 if x > 128,
   3305     * the result is INF  and if it's smaller than -126.9 the result is 0 */
   3306    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
   3307                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   3308    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
   3309                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   3310 
   3311    /* ipart = floor(x) */
   3312    /* fpart = x - ipart */
   3313    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   3314 
   3315    /* expipart = (float) (1 << ipart) */
   3316    expipart = LLVMBuildAdd(builder, ipart,
   3317                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
   3318    expipart = LLVMBuildShl(builder, expipart,
   3319                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
   3320    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   3321 
   3322    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
   3323                                   ARRAY_SIZE(lp_build_exp2_polynomial));
   3324 
   3325    res = LLVMBuildFMul(builder, expipart, expfpart, "");
   3326 
   3327    return res;
   3328 }
   3329 
   3330 
   3331 
   3332 /**
   3333  * Extract the exponent of a IEEE-754 floating point value.
   3334  *
   3335  * Optionally apply an integer bias.
   3336  *
   3337  * Result is an integer value with
   3338  *
   3339  *   ifloor(log2(x)) + bias
   3340  */
   3341 LLVMValueRef
   3342 lp_build_extract_exponent(struct lp_build_context *bld,
   3343                           LLVMValueRef x,
   3344                           int bias)
   3345 {
   3346    LLVMBuilderRef builder = bld->gallivm->builder;
   3347    const struct lp_type type = bld->type;
   3348    unsigned mantissa = lp_mantissa(type);
   3349    LLVMValueRef res;
   3350 
   3351    assert(type.floating);
   3352 
   3353    assert(lp_check_value(bld->type, x));
   3354 
   3355    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   3356 
   3357    res = LLVMBuildLShr(builder, x,
   3358                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   3359    res = LLVMBuildAnd(builder, res,
   3360                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
   3361    res = LLVMBuildSub(builder, res,
   3362                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
   3363 
   3364    return res;
   3365 }
   3366 
   3367 
   3368 /**
   3369  * Extract the mantissa of the a floating.
   3370  *
   3371  * Result is a floating point value with
   3372  *
   3373  *   x / floor(log2(x))
   3374  */
   3375 LLVMValueRef
   3376 lp_build_extract_mantissa(struct lp_build_context *bld,
   3377                           LLVMValueRef x)
   3378 {
   3379    LLVMBuilderRef builder = bld->gallivm->builder;
   3380    const struct lp_type type = bld->type;
   3381    unsigned mantissa = lp_mantissa(type);
   3382    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
   3383                                                   (1ULL << mantissa) - 1);
   3384    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   3385    LLVMValueRef res;
   3386 
   3387    assert(lp_check_value(bld->type, x));
   3388 
   3389    assert(type.floating);
   3390 
   3391    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   3392 
   3393    /* res = x / 2**ipart */
   3394    res = LLVMBuildAnd(builder, x, mantmask, "");
   3395    res = LLVMBuildOr(builder, res, one, "");
   3396    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   3397 
   3398    return res;
   3399 }
   3400 
   3401 
   3402 
   3403 /**
   3404  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
   3405  * These coefficients can be generate with
   3406  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   3407  */
   3408 const double lp_build_log2_polynomial[] = {
   3409 #if LOG_POLY_DEGREE == 5
   3410    2.88539008148777786488L,
   3411    0.961796878841293367824L,
   3412    0.577058946784739859012L,
   3413    0.412914355135828735411L,
   3414    0.308591899232910175289L,
   3415    0.352376952300281371868L,
   3416 #elif LOG_POLY_DEGREE == 4
   3417    2.88539009343309178325L,
   3418    0.961791550404184197881L,
   3419    0.577440339438736392009L,
   3420    0.403343858251329912514L,
   3421    0.406718052498846252698L,
   3422 #elif LOG_POLY_DEGREE == 3
   3423    2.88538959748872753838L,
   3424    0.961932915889597772928L,
   3425    0.571118517972136195241L,
   3426    0.493997535084709500285L,
   3427 #else
   3428 #error
   3429 #endif
   3430 };
   3431 
   3432 /**
   3433  * See http://www.devmaster.net/forums/showthread.php?p=43580
   3434  * http://en.wikipedia.org/wiki/Logarithm#Calculation
   3435  * http://www.nezumi.demon.co.uk/consult/logx.htm
   3436  *
   3437  * If handle_edge_cases is true the function will perform computations
   3438  * to match the required D3D10+ behavior for each of the edge cases.
   3439  * That means that if input is:
   3440  * - less than zero (to and including -inf) then NaN will be returned
   3441  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
   3442  * - +infinity, then +infinity will be returned
   3443  * - NaN, then NaN will be returned
   3444  *
   3445  * Those checks are fairly expensive so if you don't need them make sure
   3446  * handle_edge_cases is false.
   3447  */
   3448 void
   3449 lp_build_log2_approx(struct lp_build_context *bld,
   3450                      LLVMValueRef x,
   3451                      LLVMValueRef *p_exp,
   3452                      LLVMValueRef *p_floor_log2,
   3453                      LLVMValueRef *p_log2,
   3454                      boolean handle_edge_cases)
   3455 {
   3456    LLVMBuilderRef builder = bld->gallivm->builder;
   3457    const struct lp_type type = bld->type;
   3458    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   3459    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   3460 
   3461    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   3462    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   3463    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
   3464 
   3465    LLVMValueRef i = NULL;
   3466    LLVMValueRef y = NULL;
   3467    LLVMValueRef z = NULL;
   3468    LLVMValueRef exp = NULL;
   3469    LLVMValueRef mant = NULL;
   3470    LLVMValueRef logexp = NULL;
   3471    LLVMValueRef p_z = NULL;
   3472    LLVMValueRef res = NULL;
   3473 
   3474    assert(lp_check_value(bld->type, x));
   3475 
   3476    if(p_exp || p_floor_log2 || p_log2) {
   3477       /* TODO: optimize the constant case */
   3478       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   3479           LLVMIsConstant(x)) {
   3480          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3481                       __FUNCTION__);
   3482       }
   3483 
   3484       assert(type.floating && type.width == 32);
   3485 
   3486       /*
   3487        * We don't explicitly handle denormalized numbers. They will yield a
   3488        * result in the neighbourhood of -127, which appears to be adequate
   3489        * enough.
   3490        */
   3491 
   3492       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
   3493 
   3494       /* exp = (float) exponent(x) */
   3495       exp = LLVMBuildAnd(builder, i, expmask, "");
   3496    }
   3497 
   3498    if(p_floor_log2 || p_log2) {
   3499       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
   3500       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
   3501       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   3502    }
   3503 
   3504    if (p_log2) {
   3505       /* mant = 1 + (float) mantissa(x) */
   3506       mant = LLVMBuildAnd(builder, i, mantmask, "");
   3507       mant = LLVMBuildOr(builder, mant, one, "");
   3508       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
   3509 
   3510       /* y = (mant - 1) / (mant + 1) */
   3511       y = lp_build_div(bld,
   3512          lp_build_sub(bld, mant, bld->one),
   3513          lp_build_add(bld, mant, bld->one)
   3514       );
   3515 
   3516       /* z = y^2 */
   3517       z = lp_build_mul(bld, y, y);
   3518 
   3519       /* compute P(z) */
   3520       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
   3521                                 ARRAY_SIZE(lp_build_log2_polynomial));
   3522 
   3523       /* y * P(z) + logexp */
   3524       res = lp_build_mad(bld, y, p_z, logexp);
   3525 
   3526       if (type.floating && handle_edge_cases) {
   3527          LLVMValueRef negmask, infmask,  zmask;
   3528          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
   3529                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
   3530          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
   3531                               lp_build_const_vec(bld->gallivm, type,  0.0f));
   3532          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
   3533                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
   3534 
   3535          /* If x is qual to inf make sure we return inf */
   3536          res = lp_build_select(bld, infmask,
   3537                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
   3538                                res);
   3539          /* If x is qual to 0, return -inf */
   3540          res = lp_build_select(bld, zmask,
   3541                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
   3542                                res);
   3543          /* If x is nan or less than 0, return nan */
   3544          res = lp_build_select(bld, negmask,
   3545                                lp_build_const_vec(bld->gallivm, type,  NAN),
   3546                                res);
   3547       }
   3548    }
   3549 
   3550    if (p_exp) {
   3551       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
   3552       *p_exp = exp;
   3553    }
   3554 
   3555    if (p_floor_log2)
   3556       *p_floor_log2 = logexp;
   3557 
   3558    if (p_log2)
   3559       *p_log2 = res;
   3560 }
   3561 
   3562 
   3563 /*
   3564  * log2 implementation which doesn't have special code to
   3565  * handle edge cases (-inf, 0, inf, NaN). It's faster but
   3566  * the results for those cases are undefined.
   3567  */
   3568 LLVMValueRef
   3569 lp_build_log2(struct lp_build_context *bld,
   3570               LLVMValueRef x)
   3571 {
   3572    LLVMValueRef res;
   3573    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   3574    return res;
   3575 }
   3576 
   3577 /*
   3578  * Version of log2 which handles all edge cases.
   3579  * Look at documentation of lp_build_log2_approx for
   3580  * description of the behavior for each of the edge cases.
   3581  */
   3582 LLVMValueRef
   3583 lp_build_log2_safe(struct lp_build_context *bld,
   3584                    LLVMValueRef x)
   3585 {
   3586    LLVMValueRef res;
   3587    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   3588    return res;
   3589 }
   3590 
   3591 
   3592 /**
   3593  * Faster (and less accurate) log2.
   3594  *
   3595  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
   3596  *
   3597  * Piece-wise linear approximation, with exact results when x is a
   3598  * power of two.
   3599  *
   3600  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
   3601  */
   3602 LLVMValueRef
   3603 lp_build_fast_log2(struct lp_build_context *bld,
   3604                    LLVMValueRef x)
   3605 {
   3606    LLVMBuilderRef builder = bld->gallivm->builder;
   3607    LLVMValueRef ipart;
   3608    LLVMValueRef fpart;
   3609 
   3610    assert(lp_check_value(bld->type, x));
   3611 
   3612    assert(bld->type.floating);
   3613 
   3614    /* ipart = floor(log2(x)) - 1 */
   3615    ipart = lp_build_extract_exponent(bld, x, -1);
   3616    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
   3617 
   3618    /* fpart = x / 2**ipart */
   3619    fpart = lp_build_extract_mantissa(bld, x);
   3620 
   3621    /* ipart + fpart */
   3622    return LLVMBuildFAdd(builder, ipart, fpart, "");
   3623 }
   3624 
   3625 
   3626 /**
   3627  * Fast implementation of iround(log2(x)).
   3628  *
   3629  * Not an approximation -- it should give accurate results all the time.
   3630  */
   3631 LLVMValueRef
   3632 lp_build_ilog2(struct lp_build_context *bld,
   3633                LLVMValueRef x)
   3634 {
   3635    LLVMBuilderRef builder = bld->gallivm->builder;
   3636    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   3637    LLVMValueRef ipart;
   3638 
   3639    assert(bld->type.floating);
   3640 
   3641    assert(lp_check_value(bld->type, x));
   3642 
   3643    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   3644    x = LLVMBuildFMul(builder, x, sqrt2, "");
   3645 
   3646    /* ipart = floor(log2(x) + 0.5)  */
   3647    ipart = lp_build_extract_exponent(bld, x, 0);
   3648 
   3649    return ipart;
   3650 }
   3651 
   3652 LLVMValueRef
   3653 lp_build_mod(struct lp_build_context *bld,
   3654              LLVMValueRef x,
   3655              LLVMValueRef y)
   3656 {
   3657    LLVMBuilderRef builder = bld->gallivm->builder;
   3658    LLVMValueRef res;
   3659    const struct lp_type type = bld->type;
   3660 
   3661    assert(lp_check_value(type, x));
   3662    assert(lp_check_value(type, y));
   3663 
   3664    if (type.floating)
   3665       res = LLVMBuildFRem(builder, x, y, "");
   3666    else if (type.sign)
   3667       res = LLVMBuildSRem(builder, x, y, "");
   3668    else
   3669       res = LLVMBuildURem(builder, x, y, "");
   3670    return res;
   3671 }
   3672 
   3673 
   3674 /*
   3675  * For floating inputs it creates and returns a mask
   3676  * which is all 1's for channels which are NaN.
   3677  * Channels inside x which are not NaN will be 0.
   3678  */
   3679 LLVMValueRef
   3680 lp_build_isnan(struct lp_build_context *bld,
   3681                LLVMValueRef x)
   3682 {
   3683    LLVMValueRef mask;
   3684    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   3685 
   3686    assert(bld->type.floating);
   3687    assert(lp_check_value(bld->type, x));
   3688 
   3689    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
   3690                         "isnotnan");
   3691    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   3692    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   3693    return mask;
   3694 }
   3695 
   3696 /* Returns all 1's for floating point numbers that are
   3697  * finite numbers and returns all zeros for -inf,
   3698  * inf and nan's */
   3699 LLVMValueRef
   3700 lp_build_isfinite(struct lp_build_context *bld,
   3701                   LLVMValueRef x)
   3702 {
   3703    LLVMBuilderRef builder = bld->gallivm->builder;
   3704    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   3705    struct lp_type int_type = lp_int_type(bld->type);
   3706    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   3707    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
   3708                                                     0x7f800000);
   3709 
   3710    if (!bld->type.floating) {
   3711       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   3712    }
   3713    assert(bld->type.floating);
   3714    assert(lp_check_value(bld->type, x));
   3715    assert(bld->type.width == 32);
   3716 
   3717    intx = LLVMBuildAnd(builder, intx, infornan32, "");
   3718    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
   3719                            intx, infornan32);
   3720 }
   3721 
   3722 /*
   3723  * Returns true if the number is nan or inf and false otherwise.
   3724  * The input has to be a floating point vector.
   3725  */
   3726 LLVMValueRef
   3727 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
   3728                        const struct lp_type type,
   3729                        LLVMValueRef x)
   3730 {
   3731    LLVMBuilderRef builder = gallivm->builder;
   3732    struct lp_type int_type = lp_int_type(type);
   3733    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
   3734                                                 0x7f800000);
   3735    LLVMValueRef ret;
   3736 
   3737    assert(type.floating);
   3738 
   3739    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   3740    ret = LLVMBuildAnd(builder, ret, const0, "");
   3741    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
   3742                           ret, const0);
   3743 
   3744    return ret;
   3745 }
   3746 
   3747 
   3748 LLVMValueRef
   3749 lp_build_fpstate_get(struct gallivm_state *gallivm)
   3750 {
   3751    if (util_cpu_caps.has_sse) {
   3752       LLVMBuilderRef builder = gallivm->builder;
   3753       LLVMValueRef mxcsr_ptr = lp_build_alloca(
   3754          gallivm,
   3755          LLVMInt32TypeInContext(gallivm->context),
   3756          "mxcsr_ptr");
   3757       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
   3758           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
   3759       lp_build_intrinsic(builder,
   3760                          "llvm.x86.sse.stmxcsr",
   3761                          LLVMVoidTypeInContext(gallivm->context),
   3762                          &mxcsr_ptr8, 1, 0);
   3763       return mxcsr_ptr;
   3764    }
   3765    return 0;
   3766 }
   3767 
   3768 void
   3769 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
   3770                                   boolean zero)
   3771 {
   3772    if (util_cpu_caps.has_sse) {
   3773       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
   3774       int daz_ftz = _MM_FLUSH_ZERO_MASK;
   3775 
   3776       LLVMBuilderRef builder = gallivm->builder;
   3777       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
   3778       LLVMValueRef mxcsr =
   3779          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
   3780 
   3781       if (util_cpu_caps.has_daz) {
   3782          /* Enable denormals are zero mode */
   3783          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
   3784       }
   3785       if (zero) {
   3786          mxcsr = LLVMBuildOr(builder, mxcsr,
   3787                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
   3788       } else {
   3789          mxcsr = LLVMBuildAnd(builder, mxcsr,
   3790                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
   3791       }
   3792 
   3793       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
   3794       lp_build_fpstate_set(gallivm, mxcsr_ptr);
   3795    }
   3796 }
   3797 
   3798 void
   3799 lp_build_fpstate_set(struct gallivm_state *gallivm,
   3800                      LLVMValueRef mxcsr_ptr)
   3801 {
   3802    if (util_cpu_caps.has_sse) {
   3803       LLVMBuilderRef builder = gallivm->builder;
   3804       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
   3805                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
   3806       lp_build_intrinsic(builder,
   3807                          "llvm.x86.sse.ldmxcsr",
   3808                          LLVMVoidTypeInContext(gallivm->context),
   3809                          &mxcsr_ptr, 1, 0);
   3810    }
   3811 }
   3812