/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here, because:
 * - it is very easy, given that we have all the necessary information readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range (see the example following this comment)
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
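
/*
 * For instance (illustrative example): with a normalized type, where values
 * are known to lie in [0, 1], lp_build_add() below folds
 *
 *    a + 1.0  ->  1.0        (addition saturates at one)
 *
 * without emitting a single instruction -- a simplification a generic LLVM
 * pass cannot make, since it has no way to know the range constraint.
 */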


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4

/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (such as 1.0 or 0.0) are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (such as 1.0 or 0.0) are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a)) {
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   }
   else {
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
   }
}
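
/*
 * Worked example (illustrative): for an 8-bit unsigned normalized type,
 * "one" is 0xff, so 1.0 - a and ~a coincide:
 *
 *    0xff - a == ~a      e.g.  0xff - 0x40 == 0xbf == ~0x40
 *
 * which is why the norm/unsigned case above gets away with a single NOT.
 */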


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   }
   else {
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");
   }

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
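
/*
 * Usage sketch (hypothetical caller, for illustration only): adding two
 * 8-bit unorm vectors takes the paddus.b path above, so 0xf0 + 0x20
 * saturates to 0xff instead of wrapping around to 0x10.
 */
#if 0
static LLVMValueRef
example_add_unorm8(struct lp_build_context *u8n_bld,
                   LLVMValueRef a, LLVMValueRef b)
{
   /* u8n_bld is assumed initialized with an 8-bit, norm, unsigned type */
   return lp_build_add(u8n_bld, a, b);
}
#endif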


/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * We use repeated shuffle/adds here. Note that with multiple vectors
    * this can be done more efficiently, as outlined in the Intel
    * optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* we always have a vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
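
/*
 * Worked example (illustrative): for a float4 <a,b,c,d> the loop above runs
 * exactly once, with length == 2:
 *
 *    vec1   = <a, b>        (elements 0..1)
 *    vec2   = <c, d>        (elements 2..3)
 *    vecres = <a+c, b+d>
 *
 * and the final extract/add pair yields (a+c) + (b+d).
 */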

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4's, which also determines
 * the output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on the cpu) and we always need two horizontal adds anyway,
    * so for a single vector a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}
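
/*
 * Usage sketch (hypothetical caller, for illustration only): summing three
 * float4 vectors in one call.
 */
#if 0
static LLVMValueRef
example_hadd3(struct lp_build_context *f32x4_bld,
              LLVMValueRef x, LLVMValueRef y, LLVMValueRef z)
{
   LLVMValueRef vecs[3];
   vecs[0] = x;
   vecs[1] = y;
   vecs[2] = z;
   /* result: <sum(x0..x3), sum(y0..y3), sum(z0..z3), undef> */
   return lp_build_hadd_partial4(f32x4_bld, vecs, 3);
}
#endif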

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   }
   else {
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");
   }

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself it doesn't satisfy the OpenGL criteria, as it
 *     yields 255*255 = 254; so the special case b = 255 must be accounted
 *     for, or roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
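
/*
 * Worked example (illustrative) of the rounded geometric series used above,
 * for a = b = 255:
 *
 *    ab             = 255 * 255       = 65025
 *    ab + (ab >> 8) = 65025 + 254     = 65279
 *    ... + 0x80     = 65279 + 128     = 65407
 *    ... >> 8       = 65407 >> 8      = 255
 *
 * so 255*255 yields exactly 255, meeting the OpenGL criteria; the largest
 * intermediate (65407) still fits in the 16-bit lanes.
 */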


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
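
/*
 * Worked example (illustrative) of the fixed-point path above: for a 16-bit
 * fixed-point type the product is shifted right by width/2 == 8 to restore
 * the scale, e.g. in 8.8 format:
 *
 *    0.5 * 0.5  ->  0x0080 * 0x0080 = 0x4000,  0x4000 >> 8 = 0x0040 = 0.25
 *
 * (the operands must be small enough for the product to fit in type.width
 * bits, since the multiply itself is done at the native width).
 */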


/**
 * Optimized multiplication of a vector by a small integer constant.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster; it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
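
/*
 * Worked example (illustrative): for an integer type, lp_build_mul_imm(bld,
 * a, 8) takes the power-of-two branch above: ffs(8) - 1 == 3, so it emits a
 * single "a << 3" instead of a full multiply.
 */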


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored in 16 bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(bld->gallivm, bld->type,
                                                (1 << bld->type.width/2) - 1),
                         "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 2**n - 1] to [0, 2**n], e.g. from [0, 255] to
       * [0, 256] for 8-bit weights (see the worked example below).
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
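
/*
 * Worked example (illustrative) of the weight scaling in lp_build_lerp()
 * above, for 8-bit weights (shift == 7), which makes x == 255 select v1
 * exactly:
 *
 *    x = 255:  255 + (255 >> 7) = 255 + 1 = 256
 *    x = 127:  127 + (127 >> 7) = 127 + 0 = 127
 *
 * so the top of the range maps to a full 256/256 = 1.0 weight, with no
 * off-by-one at the end points.
 */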


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, absMask);
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
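
/*
 * Worked example (illustrative) of the floating-point path in lp_build_abs()
 * above: for 32-bit floats the mask is 0x7fffffff, so e.g.
 *
 *    -1.0f = 0xbf800000,  0xbf800000 & 0x7fffffff = 0x3f800000 = 1.0f
 *
 * i.e. clearing the sign bit is a branchless fabs().
 */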


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and OR it into the constant 1.0 */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with ssse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
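
/*
 * Worked example (illustrative) of the floating-point path in lp_build_sgn()
 * above: the sign bit of 'a' is OR'ed into the constant 1.0:
 *
 *    a = -3.5f:  sign = 0x80000000,  0x80000000 | 0x3f800000 = -1.0f
 *    a =  3.5f:  sign = 0x00000000,  0x00000000 | 0x3f800000 =  1.0f
 *
 * with a == 0 patched up afterwards by the final select.
 */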


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
sse41_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;

   return FALSE;
}

enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, 2.5 rounds to 2.0, not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (sse41_rounding_available(type)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}
   1541 
   1542 
   1543 /**
   1544  * Return fractional part of 'a' computed as a - floor(a)
   1545  * Typically used in texture coord arithmetic.
   1546  */
   1547 LLVMValueRef
   1548 lp_build_fract(struct lp_build_context *bld,
   1549                LLVMValueRef a)
   1550 {
   1551    assert(bld->type.floating);
   1552    return lp_build_sub(bld, a, lp_build_floor(bld, a));
   1553 }
   1554 
   1555 
   1556 /**
   1557  * Prevent returning a fractional part of 1.0 for very small negative values of
   1558  * 'a' by clamping against 0.99999(9).
   1559  */
   1560 static inline LLVMValueRef
   1561 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
   1562 {
   1563    LLVMValueRef max;
   1564 
   1565    /* the largest number smaller than 1.0 representable as a float */
   1566    max = lp_build_const_vec(bld->gallivm, bld->type,
   1567                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   1568    return lp_build_min(bld, fract, max);
   1569 }
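
        /*
         * For reference: with 32-bit floats lp_mantissa() is 23, so the clamp
         * constant above is 1.0 - 1.0/2^24 = 0.99999994 (bits 0x3f7fffff),
         * i.e. the largest float strictly below 1.0.
         */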
   1570 
   1571 
   1572 /**
   1573  * Same as lp_build_fract, but guarantees that the result is always smaller
   1574  * than one.
   1575  */
   1576 LLVMValueRef
   1577 lp_build_fract_safe(struct lp_build_context *bld,
   1578                     LLVMValueRef a)
   1579 {
   1580    return clamp_fract(bld, lp_build_fract(bld, a));
   1581 }
   1582 
   1583 
   1584 /**
   1585  * Return the integer part of a float (vector) value (== round toward zero).
   1586  * The returned value is an integer (vector).
   1587  * Ex: itrunc(-1.5) = -1
   1588  */
   1589 LLVMValueRef
   1590 lp_build_itrunc(struct lp_build_context *bld,
   1591                 LLVMValueRef a)
   1592 {
   1593    LLVMBuilderRef builder = bld->gallivm->builder;
   1594    const struct lp_type type = bld->type;
   1595    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   1596 
   1597    assert(type.floating);
   1598    assert(lp_check_value(type, a));
   1599 
   1600    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
   1601 }
   1602 
   1603 
   1604 /**
   1605  * Return float (vector) rounded to nearest integer (vector).  The returned
   1606  * value is an integer (vector).
   1607  * Ex: iround(0.9) = 1
   1608  * Ex: iround(-1.5) = -2
   1609  */
   1610 LLVMValueRef
   1611 lp_build_iround(struct lp_build_context *bld,
   1612                 LLVMValueRef a)
   1613 {
   1614    LLVMBuilderRef builder = bld->gallivm->builder;
   1615    const struct lp_type type = bld->type;
   1616    LLVMTypeRef int_vec_type = bld->int_vec_type;
   1617    LLVMValueRef res;
   1618 
   1619    assert(type.floating);
   1620 
   1621    assert(lp_check_value(type, a));
   1622 
   1623    if ((util_cpu_caps.has_sse2 &&
   1624        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
   1625        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   1626       return lp_build_iround_nearest_sse2(bld, a);
   1627    }
   1628    if (sse41_rounding_available(type)) {
   1629       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   1630    }
   1631    else {
   1632       LLVMValueRef half;
   1633 
   1634       half = lp_build_const_vec(bld->gallivm, type, 0.5);
   1635 
   1636       if (type.sign) {
   1637          LLVMTypeRef vec_type = bld->vec_type;
   1638          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   1639                                     (unsigned long long)1 << (type.width - 1));
   1640          LLVMValueRef sign;
   1641 
   1642          /* get sign bit */
   1643          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1644          sign = LLVMBuildAnd(builder, sign, mask, "");
   1645 
   1646          /* sign * 0.5 */
   1647          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
   1648          half = LLVMBuildOr(builder, sign, half, "");
   1649          half = LLVMBuildBitCast(builder, half, vec_type, "");
   1650       }
   1651 
   1652       res = LLVMBuildFAdd(builder, a, half, "");
   1653    }
   1654 
   1655    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   1656 
   1657    return res;
   1658 }
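
        /*
         * The bias trick above implements round-half-away-from-zero: 0.5 with
         * the sign of 'a' copied in is added before truncating, e.g.
         *
         *    iround( 0.9):  0.9 + 0.5 =  1.4  ->  1
         *    iround(-1.5): -1.5 - 0.5 = -2.0  -> -2
         */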
   1659 
   1660 
   1661 /**
   1662  * Return floor of float (vector), result is an int (vector)
   1663  * Ex: ifloor(1.1) = 1
   1664  * Ex: ifloor(-1.1) = -2
   1665  */
   1666 LLVMValueRef
   1667 lp_build_ifloor(struct lp_build_context *bld,
   1668                 LLVMValueRef a)
   1669 {
   1670    LLVMBuilderRef builder = bld->gallivm->builder;
   1671    const struct lp_type type = bld->type;
   1672    LLVMTypeRef int_vec_type = bld->int_vec_type;
   1673    LLVMValueRef res;
   1674 
   1675    assert(type.floating);
   1676    assert(lp_check_value(type, a));
   1677 
   1678    res = a;
   1679    if (type.sign) {
   1680       if (sse41_rounding_available(type)) {
   1681          res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   1682       }
   1683       else {
   1684          /* For negative inputs, add an offset of -0.99999(9) before truncating */
   1685          LLVMTypeRef vec_type = bld->vec_type;
   1686          unsigned mantissa = lp_mantissa(type);
   1687          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   1688                                   (unsigned long long)1 << (type.width - 1));
   1689          LLVMValueRef sign;
   1690          LLVMValueRef offset;
   1691 
   1692          /* sign = a < 0 ? ~0 : 0 */
   1693          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1694          sign = LLVMBuildAnd(builder, sign, mask, "");
   1695          sign = LLVMBuildAShr(builder, sign,
   1696                               lp_build_const_int_vec(bld->gallivm, type,
   1697                                                      type.width - 1),
   1698                               "ifloor.sign");
   1699 
   1700          /* offset = -0.99999(9)f */
   1701          offset = lp_build_const_vec(bld->gallivm, type,
   1702                                      -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
   1703          offset = LLVMConstBitCast(offset, int_vec_type);
   1704 
   1705          /* offset = a < 0 ? offset : 0.0f */
   1706          offset = LLVMBuildAnd(builder, offset, sign, "");
   1707          offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
   1708 
   1709          res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
   1710       }
   1711    }
   1712 
   1713    /* truncate (round toward zero) */
   1714    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
   1715 
   1716    return res;
   1717 }
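
        /*
         * Worked example for the non-SSE4.1 path above: negative inputs get an
         * offset just below 1.0 subtracted before the truncation, e.g.
         *
         *    ifloor(-1.1): -1.1 - 0.99999... = -2.09999... -> -2
         *    ifloor(-2.0): -2.0 - 0.99999... = -2.99999... -> -2
         *
         * so exact negative integers are not pushed down an extra step.
         */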
   1718 
   1719 
   1720 /**
   1721  * Return ceiling of float (vector), returning int (vector).
   1722  * Ex: iceil( 1.1) = 2
   1723  * Ex: iceil(-1.1) = -1
   1724  */
   1725 LLVMValueRef
   1726 lp_build_iceil(struct lp_build_context *bld,
   1727                LLVMValueRef a)
   1728 {
   1729    LLVMBuilderRef builder = bld->gallivm->builder;
   1730    const struct lp_type type = bld->type;
   1731    LLVMTypeRef int_vec_type = bld->int_vec_type;
   1732    LLVMValueRef res;
   1733 
   1734    assert(type.floating);
   1735    assert(lp_check_value(type, a));
   1736 
   1737    if (sse41_rounding_available(type)) {
   1738       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   1739    }
   1740    else {
   1741       LLVMTypeRef vec_type = bld->vec_type;
   1742       unsigned mantissa = lp_mantissa(type);
   1743       LLVMValueRef offset;
   1744 
   1745       /* offset = 0.99999(9)f */
   1746       offset = lp_build_const_vec(bld->gallivm, type,
   1747                                   (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
   1748 
   1749       if (type.sign) {
   1750          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   1751                                 (unsigned long long)1 << (type.width - 1));
   1752          LLVMValueRef sign;
   1753 
   1754          /* sign = a < 0 ? 0 : ~0 */
   1755          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1756          sign = LLVMBuildAnd(builder, sign, mask, "");
   1757          sign = LLVMBuildAShr(builder, sign,
   1758                               lp_build_const_int_vec(bld->gallivm, type,
   1759                                                      type.width - 1),
   1760                               "iceil.sign");
   1761          sign = LLVMBuildNot(builder, sign, "iceil.not");
   1762 
   1763          /* offset = a < 0 ? 0.0 : offset */
   1764          offset = LLVMConstBitCast(offset, int_vec_type);
   1765          offset = LLVMBuildAnd(builder, offset, sign, "");
   1766          offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
   1767       }
   1768 
   1769       res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   1770    }
   1771 
   1772    /* truncate (round toward zero) */
   1773    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
   1774 
   1775    return res;
   1776 }
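
        /*
         * lp_build_iceil() mirrors the ifloor() trick: non-negative inputs get
         * an offset just below 1.0 added before the truncation, e.g.
         *
         *    iceil(1.1): 1.1 + 0.99999... = 2.09999... -> 2
         *    iceil(2.0): 2.0 + 0.99999... = 2.99999... -> 2
         */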
   1777 
   1778 
   1779 /**
   1780  * Combined ifloor() & fract().
   1781  *
   1782  * Preferable to calling the functions separately, as it ensures that the
   1783  * strategy (floor() vs. ifloor()) resulting in the least redundant work is used.
   1784  */
   1785 void
   1786 lp_build_ifloor_fract(struct lp_build_context *bld,
   1787                       LLVMValueRef a,
   1788                       LLVMValueRef *out_ipart,
   1789                       LLVMValueRef *out_fpart)
   1790 {
   1791    LLVMBuilderRef builder = bld->gallivm->builder;
   1792    const struct lp_type type = bld->type;
   1793    LLVMValueRef ipart;
   1794 
   1795    assert(type.floating);
   1796    assert(lp_check_value(type, a));
   1797 
   1798    if (sse41_rounding_available(type)) {
   1799       /*
   1800        * floor() is easier.
   1801        */
   1802 
   1803       ipart = lp_build_floor(bld, a);
   1804       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   1805       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   1806    }
   1807    else {
   1808       /*
   1809        * ifloor() is easier.
   1810        */
   1811 
   1812       *out_ipart = lp_build_ifloor(bld, a);
   1813       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
   1814       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   1815    }
   1816 }
   1817 
   1818 
   1819 /**
   1820  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
   1821  * always smaller than one.
   1822  */
   1823 void
   1824 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
   1825                            LLVMValueRef a,
   1826                            LLVMValueRef *out_ipart,
   1827                            LLVMValueRef *out_fpart)
   1828 {
   1829    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   1830    *out_fpart = clamp_fract(bld, *out_fpart);
   1831 }
   1832 
   1833 
   1834 LLVMValueRef
   1835 lp_build_sqrt(struct lp_build_context *bld,
   1836               LLVMValueRef a)
   1837 {
   1838    LLVMBuilderRef builder = bld->gallivm->builder;
   1839    const struct lp_type type = bld->type;
   1840    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1841    char intrinsic[32];
   1842 
   1843    assert(lp_check_value(type, a));
   1844 
   1845    /* TODO: optimize the constant case */
   1846 
   1847    assert(type.floating);
   1848    if (type.length == 1) {
   1849       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
   1850    }
   1851    else {
   1852       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
   1853    }
   1854 
   1855    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   1856 }
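
        /*
         * For example, a 4 x f32 context maps to the intrinsic name
         * "llvm.sqrt.v4f32", and a scalar f32 context to "llvm.sqrt.f32".
         */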
   1857 
   1858 
   1859 /**
   1860  * Do one Newton-Raphson step to improve reciprocal precision:
   1861  *
   1862  *   x_{i+1} = x_i * (2 - a * x_i)
   1863  *
   1864  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
   1865  * +/-Inf, yielding NaN instead.  Certain applications rely on the IEEE
   1866  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
   1867  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
   1868  *
   1869  * See also:
   1870  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
   1871  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
   1872  */
   1873 static INLINE LLVMValueRef
   1874 lp_build_rcp_refine(struct lp_build_context *bld,
   1875                     LLVMValueRef a,
   1876                     LLVMValueRef rcp_a)
   1877 {
   1878    LLVMBuilderRef builder = bld->gallivm->builder;
   1879    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   1880    LLVMValueRef res;
   1881 
   1882    res = LLVMBuildFMul(builder, a, rcp_a, "");
   1883    res = LLVMBuildFSub(builder, two, res, "");
   1884    res = LLVMBuildFMul(builder, rcp_a, res, "");
   1885 
   1886    return res;
   1887 }
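
        /*
         * Derivation: applying Newton-Raphson to f(x) = 1/x - a gives
         *
         *    x_{i+1} = x_i - f(x_i)/f'(x_i)
         *            = x_i + (1/x_i - a) * x_i^2
         *            = x_i * (2 - a * x_i)
         *
         * Each step roughly doubles the number of correct mantissa bits.
         */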
   1888 
   1889 
   1890 LLVMValueRef
   1891 lp_build_rcp(struct lp_build_context *bld,
   1892              LLVMValueRef a)
   1893 {
   1894    LLVMBuilderRef builder = bld->gallivm->builder;
   1895    const struct lp_type type = bld->type;
   1896 
   1897    assert(lp_check_value(type, a));
   1898 
   1899    if(a == bld->zero)
   1900       return bld->undef;
   1901    if(a == bld->one)
   1902       return bld->one;
   1903    if(a == bld->undef)
   1904       return bld->undef;
   1905 
   1906    assert(type.floating);
   1907 
   1908    if(LLVMIsConstant(a))
   1909       return LLVMConstFDiv(bld->one, a);
   1910 
   1911    /*
   1912     * We don't use RCPPS because:
   1913     * - it only has 10 bits of precision
   1914     * - it doesn't even get the reciprocal of 1.0 exactly
   1915     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
   1916     * - for recent processors the benefit over DIVPS is marginal and case
   1917     *   dependent
   1918     *
   1919     * We could still use it on certain processors if benchmarks show that
   1920     * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
   1921     * particular uses that require fewer workarounds.
   1922     */
   1923 
   1924    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   1925          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
   1926       const unsigned num_iterations = 0;
   1927       LLVMValueRef res;
   1928       unsigned i;
   1929       const char *intrinsic = NULL;
   1930 
   1931       if (type.length == 4) {
   1932          intrinsic = "llvm.x86.sse.rcp.ps";
   1933       }
   1934       else {
   1935          intrinsic = "llvm.x86.avx.rcp.ps.256";
   1936       }
   1937 
   1938       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   1939 
   1940       for (i = 0; i < num_iterations; ++i) {
   1941          res = lp_build_rcp_refine(bld, a, res);
   1942       }
   1943 
   1944       return res;
   1945    }
   1946 
   1947    return LLVMBuildFDiv(builder, bld->one, a, "");
   1948 }
   1949 
   1950 
   1951 /**
   1952  * Do one Newton-Raphson step to improve rsqrt precision:
   1953  *
   1954  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
   1955  *
   1956  * See also:
   1957  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
   1958  */
   1959 static INLINE LLVMValueRef
   1960 lp_build_rsqrt_refine(struct lp_build_context *bld,
   1961                       LLVMValueRef a,
   1962                       LLVMValueRef rsqrt_a)
   1963 {
   1964    LLVMBuilderRef builder = bld->gallivm->builder;
   1965    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   1966    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   1967    LLVMValueRef res;
   1968 
   1969    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   1970    res = LLVMBuildFMul(builder, a, res, "");
   1971    res = LLVMBuildFSub(builder, three, res, "");
   1972    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   1973    res = LLVMBuildFMul(builder, half, res, "");
   1974 
   1975    return res;
   1976 }
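
        /*
         * Derivation: applying Newton-Raphson to f(x) = 1/x^2 - a gives
         *
         *    x_{i+1} = x_i - f(x_i)/f'(x_i)
         *            = x_i + (1/x_i^2 - a) * x_i^3 / 2
         *            = 0.5 * x_i * (3 - a * x_i^2)
         *
         * which is exactly the expression computed above.
         */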
   1977 
   1978 
   1979 /**
   1980  * Generate 1/sqrt(a)
   1981  */
   1982 LLVMValueRef
   1983 lp_build_rsqrt(struct lp_build_context *bld,
   1984                LLVMValueRef a)
   1985 {
   1986    LLVMBuilderRef builder = bld->gallivm->builder;
   1987    const struct lp_type type = bld->type;
   1988 
   1989    assert(lp_check_value(type, a));
   1990 
   1991    assert(type.floating);
   1992 
   1993    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   1994         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   1995       const unsigned num_iterations = 1;
   1996       LLVMValueRef res;
   1997       unsigned i;
   1998       const char *intrinsic = NULL;
   1999 
   2000       if (type.length == 4) {
   2001          intrinsic = "llvm.x86.sse.rsqrt.ps";
   2002       }
   2003       else {
   2004          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
   2005       }
   2006 
   2007       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2008 
   2009 
   2010       for (i = 0; i < num_iterations; ++i) {
   2011          res = lp_build_rsqrt_refine(bld, a, res);
   2012       }
   2013 
   2014       return res;
   2015    }
   2016 
   2017    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   2018 }
   2019 
   2020 
   2021 /**
   2022  * Generate sin(a) using SSE2
   2023  */
   2024 LLVMValueRef
   2025 lp_build_sin(struct lp_build_context *bld,
   2026              LLVMValueRef a)
   2027 {
   2028    struct gallivm_state *gallivm = bld->gallivm;
   2029    LLVMBuilderRef builder = gallivm->builder;
   2030    struct lp_type int_type = lp_int_type(bld->type);
   2031    LLVMBuilderRef b = builder;
   2032 
   2033    /*
   2034     *  take the absolute value,
   2035     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   2036     */
   2037 
   2038    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   2039    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
   2040 
   2041    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   2042    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
   2043 
   2044    /*
   2045     * extract the sign bit (upper one)
   2046     * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
   2047     */
   2048    LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   2049    LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
   2050 
   2051    /*
   2052     * scale by 4/Pi
   2053     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
   2054     */
   2055 
   2056    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   2057    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
   2058 
   2059    /*
   2060     * store the integer part of y in mm0
   2061     * emm2 = _mm_cvttps_epi32(y);
   2062     */
   2063 
   2064    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
   2065 
   2066    /*
   2067     * j=(j+1) & (~1) (see the cephes sources)
   2068     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   2069     */
   2070 
   2071    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   2072    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   2073    /*
   2074     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   2075     */
   2076    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   2077    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
   2078 
   2079    /*
   2080     * y = _mm_cvtepi32_ps(emm2);
   2081     */
   2082    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
   2083 
   2084    /* get the swap sign flag
   2085     * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
   2086     */
   2087    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   2088    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
   2089 
   2090    /*
   2091     * emm2 = _mm_slli_epi32(emm0, 29);
   2092     */
   2093    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   2094    LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
   2095 
   2096    /*
   2097     * get the polynomial selection mask
   2098     * there is one polynomial for 0 <= x <= Pi/4
   2099     * and another one for Pi/4 < x <= Pi/2
   2100     * Both branches will be computed.
   2101     *
   2102     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   2103     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   2104     */
   2105 
   2106    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   2107    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   2108    LLVMValueRef poly_mask = lp_build_compare(gallivm,
   2109                                              int_type, PIPE_FUNC_EQUAL,
   2110                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   2111    /*
   2112     *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
   2113     */
   2114    LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
   2115 
   2116    /*
   2117     * _PS_CONST(minus_cephes_DP1, -0.78515625);
   2118     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
   2119     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
   2120     */
   2121    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   2122    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   2123    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
   2124 
   2125    /*
   2126     * The magic pass: "Extended precision modular arithmetic"
   2127     * x = ((x - y * DP1) - y * DP2) - y * DP3;
   2128     * xmm1 = _mm_mul_ps(y, xmm1);
   2129     * xmm2 = _mm_mul_ps(y, xmm2);
   2130     * xmm3 = _mm_mul_ps(y, xmm3);
   2131     */
   2132    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   2133    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   2134    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
   2135 
   2136    /*
   2137     * x = _mm_add_ps(x, xmm1);
   2138     * x = _mm_add_ps(x, xmm2);
   2139     * x = _mm_add_ps(x, xmm3);
   2140     */
   2141 
   2142    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   2143    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   2144    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
   2145 
   2146    /*
   2147     * Evaluate the first polynomial  (0 <= x <= Pi/4)
   2148     *
   2149     * z = _mm_mul_ps(x,x);
   2150     */
   2151    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
   2152 
   2153    /*
   2154     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
   2155     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
   2156     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
   2157     */
   2158    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   2159    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   2160    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
   2161 
   2162    /*
   2163     * y = *(v4sf*)_ps_coscof_p0;
   2164     * y = _mm_mul_ps(y, z);
   2165     */
   2166    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   2167    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   2168    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   2169    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   2170    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   2171    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
   2172 
   2173 
   2174    /*
   2175     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   2176     * y = _mm_sub_ps(y, tmp);
   2177     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
   2178     */
   2179    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   2180    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   2181    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   2182    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   2183    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
   2184 
   2185    /*
   2186     * _PS_CONST(sincof_p0, -1.9515295891E-4);
   2187     * _PS_CONST(sincof_p1,  8.3321608736E-3);
   2188     * _PS_CONST(sincof_p2, -1.6666654611E-1);
   2189     */
   2190    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   2191    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   2192    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
   2193 
   2194    /*
   2195     * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
   2196     *
   2197     * y2 = *(v4sf*)_ps_sincof_p0;
   2198     * y2 = _mm_mul_ps(y2, z);
   2199     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   2200     * y2 = _mm_mul_ps(y2, z);
   2201     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   2202     * y2 = _mm_mul_ps(y2, z);
   2203     * y2 = _mm_mul_ps(y2, x);
   2204     * y2 = _mm_add_ps(y2, x);
   2205     */
   2206 
   2207    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   2208    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   2209    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   2210    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   2211    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   2212    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   2213    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
   2214 
   2215    /*
   2216     * select the correct result from the two polynomials
   2217     * xmm3 = poly_mask;
   2218     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   2219     * y = _mm_andnot_ps(xmm3, y);
   2220     * y = _mm_add_ps(y,y2);
   2221     */
   2222    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   2223    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   2224    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   2225    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   2226    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   2227    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   2228    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
   2229 
   2230    /*
   2231     * update the sign
   2232     * y = _mm_xor_ps(y, sign_bit);
   2233     */
   2234    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   2235    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   2236    return y_result;
   2237 }
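
        /*
         * Note on the range reduction above: DP1 + DP2 + DP3 sums to -Pi/4 in
         * extended precision, following the cephes sources, so subtracting the
         * multiple of Pi/4 loses far less accuracy than a single-constant
         * subtraction would for arguments of moderately large magnitude.
         */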
   2238 
   2239 
   2240 /**
   2241  * Generate cos(a) using SSE2
   2242  */
   2243 LLVMValueRef
   2244 lp_build_cos(struct lp_build_context *bld,
   2245              LLVMValueRef a)
   2246 {
   2247    struct gallivm_state *gallivm = bld->gallivm;
   2248    LLVMBuilderRef builder = gallivm->builder;
   2249    struct lp_type int_type = lp_int_type(bld->type);
   2250    LLVMBuilderRef b = builder;
   2251 
   2252    /*
   2253     *  take the absolute value,
   2254     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   2255     */
   2256 
   2257    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   2258    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
   2259 
   2260    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   2261    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
   2262 
   2263    /*
   2264     * scale by 4/Pi
   2265     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
   2266     */
   2267 
   2268    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   2269    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
   2270 
   2271    /*
   2272     * store the integer part of y in mm0
   2273     * emm2 = _mm_cvttps_epi32(y);
   2274     */
   2275 
   2276    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
   2277 
   2278    /*
   2279     * j=(j+1) & (~1) (see the cephes sources)
   2280     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   2281     */
   2282 
   2283    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   2284    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   2285    /*
   2286     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   2287     */
   2288    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   2289    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
   2290 
   2291    /*
   2292     * y = _mm_cvtepi32_ps(emm2);
   2293     */
   2294    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
   2295 
   2296 
   2297    /*
   2298     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
   2299     */
   2300    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   2301    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
   2302 
   2303 
   2304    /* get the swap sign flag
   2305     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
   2306     */
   2307    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   2308    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   2309    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   2310    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
   2311 
   2312    /*
   2313     * emm2 = _mm_slli_epi32(emm0, 29);
   2314     */
   2315    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   2316    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
   2317 
   2318    /*
   2319     * get the polynomial selection mask
   2320     * there is one polynomial for 0 <= x <= Pi/4
   2321     * and another one for Pi/4 < x <= Pi/2
   2322     * Both branches will be computed.
   2323     *
   2324     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   2325     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   2326     */
   2327 
   2328    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   2329    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   2330    LLVMValueRef poly_mask = lp_build_compare(gallivm,
   2331                                              int_type, PIPE_FUNC_EQUAL,
   2332                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   2333 
   2334    /*
   2335     * _PS_CONST(minus_cephes_DP1, -0.78515625);
   2336     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
   2337     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
   2338     */
   2339    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   2340    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   2341    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
   2342 
   2343    /*
   2344     * The magic pass: "Extended precision modular arithmetic"
   2345     * x = ((x - y * DP1) - y * DP2) - y * DP3;
   2346     * xmm1 = _mm_mul_ps(y, xmm1);
   2347     * xmm2 = _mm_mul_ps(y, xmm2);
   2348     * xmm3 = _mm_mul_ps(y, xmm3);
   2349     */
   2350    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   2351    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   2352    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
   2353 
   2354    /*
   2355     * x = _mm_add_ps(x, xmm1);
   2356     * x = _mm_add_ps(x, xmm2);
   2357     * x = _mm_add_ps(x, xmm3);
   2358     */
   2359 
   2360    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   2361    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   2362    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
   2363 
   2364    /*
   2365     * Evaluate the first polynomial  (0 <= x <= Pi/4)
   2366     *
   2367     * z = _mm_mul_ps(x,x);
   2368     */
   2369    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
   2370 
   2371    /*
   2372     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
   2373     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
   2374     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
   2375     */
   2376    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   2377    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   2378    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
   2379 
   2380    /*
   2381     * y = *(v4sf*)_ps_coscof_p0;
   2382     * y = _mm_mul_ps(y, z);
   2383     */
   2384    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   2385    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   2386    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   2387    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   2388    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   2389    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
   2390 
   2391 
   2392    /*
   2393     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   2394     * y = _mm_sub_ps(y, tmp);
   2395     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
   2396     */
   2397    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   2398    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   2399    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   2400    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   2401    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
   2402 
   2403    /*
   2404     * _PS_CONST(sincof_p0, -1.9515295891E-4);
   2405     * _PS_CONST(sincof_p1,  8.3321608736E-3);
   2406     * _PS_CONST(sincof_p2, -1.6666654611E-1);
   2407     */
   2408    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   2409    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   2410    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
   2411 
   2412    /*
   2413     * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
   2414     *
   2415     * y2 = *(v4sf*)_ps_sincof_p0;
   2416     * y2 = _mm_mul_ps(y2, z);
   2417     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   2418     * y2 = _mm_mul_ps(y2, z);
   2419     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   2420     * y2 = _mm_mul_ps(y2, z);
   2421     * y2 = _mm_mul_ps(y2, x);
   2422     * y2 = _mm_add_ps(y2, x);
   2423     */
   2424 
   2425    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   2426    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   2427    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   2428    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   2429    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   2430    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   2431    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
   2432 
   2433    /*
   2434     * select the correct result from the two polynomials
   2435     * xmm3 = poly_mask;
   2436     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   2437     * y = _mm_andnot_ps(xmm3, y);
   2438     * y = _mm_add_ps(y,y2);
   2439     */
   2440    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   2441    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   2442    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   2443    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   2444    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   2445    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
   2446 
   2447    /*
   2448     * update the sign
   2449     * y = _mm_xor_ps(y, sign_bit);
   2450     */
   2451    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   2452    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   2453    return y_result;
   2454 }
   2455 
   2456 
   2457 /**
   2458  * Generate pow(x, y)
   2459  */
   2460 LLVMValueRef
   2461 lp_build_pow(struct lp_build_context *bld,
   2462              LLVMValueRef x,
   2463              LLVMValueRef y)
   2464 {
   2465    /* TODO: optimize the constant case */
   2466    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   2467        LLVMIsConstant(x) && LLVMIsConstant(y)) {
   2468       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   2469                    __FUNCTION__);
   2470    }
   2471 
   2472    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
   2473 }
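
        /*
         * Note that the decomposition pow(x, y) = exp2(y * log2(x)) is only
         * meaningful for x > 0, matching the usual shader semantics where
         * pow() with a negative base is undefined.
         */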
   2474 
   2475 
   2476 /**
   2477  * Generate exp(x)
   2478  */
   2479 LLVMValueRef
   2480 lp_build_exp(struct lp_build_context *bld,
   2481              LLVMValueRef x)
   2482 {
   2483    /* log2(e) = 1/log(2) */
   2484    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
   2485                                            1.4426950408889634);
   2486 
   2487    assert(lp_check_value(bld->type, x));
   2488 
   2489    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
   2490 }
   2491 
   2492 
   2493 /**
   2494  * Generate log(x)
   2495  */
   2496 LLVMValueRef
   2497 lp_build_log(struct lp_build_context *bld,
   2498              LLVMValueRef x)
   2499 {
   2500    /* log(2) */
   2501    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
   2502                                           0.69314718055994529);
   2503 
   2504    assert(lp_check_value(bld->type, x));
   2505 
   2506    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
   2507 }
   2508 
   2509 
   2510 /**
   2511  * Generate polynomial.
   2512  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
   2513  */
   2514 static LLVMValueRef
   2515 lp_build_polynomial(struct lp_build_context *bld,
   2516                     LLVMValueRef x,
   2517                     const double *coeffs,
   2518                     unsigned num_coeffs)
   2519 {
   2520    const struct lp_type type = bld->type;
   2521    LLVMValueRef even = NULL, odd = NULL;
   2522    LLVMValueRef x2;
   2523    unsigned i;
   2524 
   2525    assert(lp_check_value(bld->type, x));
   2526 
   2527    /* TODO: optimize the constant case */
   2528    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   2529        LLVMIsConstant(x)) {
   2530       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   2531                    __FUNCTION__);
   2532    }
   2533 
   2534    /*
   2535     * Calculate odd and even terms separately to decrease data dependency
   2536     * Ex:
   2537     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
   2538     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
   2539     */
   2540    x2 = lp_build_mul(bld, x, x);
   2541 
   2542    for (i = num_coeffs; i--; ) {
   2543       LLVMValueRef coeff;
   2544 
   2545       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
   2546 
   2547       if (i % 2 == 0) {
   2548          if (even)
   2549             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
   2550          else
   2551             even = coeff;
   2552       } else {
   2553          if (odd)
   2554             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
   2555          else
   2556             odd = coeff;
   2557       }
   2558    }
   2559 
   2560    if (odd)
   2561       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   2562    else if (even)
   2563       return even;
   2564    else
   2565       return bld->undef;
   2566 }
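
        /*
         * E.g. for num_coeffs == 4 the loop above effectively computes
         *
         *    even = c[0] + x2 * c[2]
         *    odd  = c[1] + x2 * c[3]
         *    res  = odd * x + even
         *
         * halving the dependency chain length compared to plain Horner
         * evaluation.
         */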
   2567 
   2568 
   2569 /**
   2570  * Minimax polynomial fit of 2**x, for x in the range [0, 1[
   2571  */
   2572 const double lp_build_exp2_polynomial[] = {
   2573 #if EXP_POLY_DEGREE == 5
   2574    0.999999925063526176901,
   2575    0.693153073200168932794,
   2576    0.240153617044375388211,
   2577    0.0558263180532956664775,
   2578    0.00898934009049466391101,
   2579    0.00187757667519147912699
   2580 #elif EXP_POLY_DEGREE == 4
   2581    1.00000259337069434683,
   2582    0.693003834469974940458,
   2583    0.24144275689150793076,
   2584    0.0520114606103070150235,
   2585    0.0135341679161270268764
   2586 #elif EXP_POLY_DEGREE == 3
   2587    0.999925218562710312959,
   2588    0.695833540494823811697,
   2589    0.226067155427249155588,
   2590    0.0780245226406372992967
   2591 #elif EXP_POLY_DEGREE == 2
   2592    1.00172476321474503578,
   2593    0.657636275736077639316,
   2594    0.33718943461968720704
   2595 #else
   2596 #error
   2597 #endif
   2598 };
   2599 
   2600 
   2601 void
   2602 lp_build_exp2_approx(struct lp_build_context *bld,
   2603                      LLVMValueRef x,
   2604                      LLVMValueRef *p_exp2_int_part,
   2605                      LLVMValueRef *p_frac_part,
   2606                      LLVMValueRef *p_exp2)
   2607 {
   2608    LLVMBuilderRef builder = bld->gallivm->builder;
   2609    const struct lp_type type = bld->type;
   2610    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   2611    LLVMValueRef ipart = NULL;
   2612    LLVMValueRef fpart = NULL;
   2613    LLVMValueRef expipart = NULL;
   2614    LLVMValueRef expfpart = NULL;
   2615    LLVMValueRef res = NULL;
   2616 
   2617    assert(lp_check_value(bld->type, x));
   2618 
   2619    if(p_exp2_int_part || p_frac_part || p_exp2) {
   2620       /* TODO: optimize the constant case */
   2621       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   2622           LLVMIsConstant(x)) {
   2623          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   2624                       __FUNCTION__);
   2625       }
   2626 
   2627       assert(type.floating && type.width == 32);
   2628 
   2629       x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
   2630       x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
   2631 
   2632       /* ipart = floor(x) */
   2633       /* fpart = x - ipart */
   2634       lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   2635    }
   2636 
   2637    if(p_exp2_int_part || p_exp2) {
   2638       /* expipart = (float) (1 << ipart) */
   2639       expipart = LLVMBuildAdd(builder, ipart,
   2640                               lp_build_const_int_vec(bld->gallivm, type, 127), "");
   2641       expipart = LLVMBuildShl(builder, expipart,
   2642                               lp_build_const_int_vec(bld->gallivm, type, 23), "");
   2643       expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   2644    }
   2645 
   2646    if(p_exp2) {
   2647       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
   2648                                      Elements(lp_build_exp2_polynomial));
   2649 
   2650       res = LLVMBuildFMul(builder, expipart, expfpart, "");
   2651    }
   2652 
   2653    if(p_exp2_int_part)
   2654       *p_exp2_int_part = expipart;
   2655 
   2656    if(p_frac_part)
   2657       *p_frac_part = fpart;
   2658 
   2659    if(p_exp2)
   2660       *p_exp2 = res;
   2661 }
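
        /*
         * Worked example: x = 2.5 gives ipart = 2, fpart = 0.5.  expipart is
         * built by placing 2 + 127 in the exponent field: (129 << 23) viewed
         * as a float is 4.0.  The polynomial yields expfpart = 2^0.5 ~= 1.41421,
         * so the result is ~5.65685 = 2^2.5.
         */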
   2662 
   2663 
   2664 LLVMValueRef
   2665 lp_build_exp2(struct lp_build_context *bld,
   2666               LLVMValueRef x)
   2667 {
   2668    LLVMValueRef res;
   2669    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   2670    return res;
   2671 }
   2672 
   2673 
   2674 /**
   2675  * Extract the exponent of an IEEE-754 floating point value.
   2676  *
   2677  * Optionally apply an integer bias.
   2678  *
   2679  * Result is an integer value with
   2680  *
   2681  *   ifloor(log2(x)) + bias
   2682  */
   2683 LLVMValueRef
   2684 lp_build_extract_exponent(struct lp_build_context *bld,
   2685                           LLVMValueRef x,
   2686                           int bias)
   2687 {
   2688    LLVMBuilderRef builder = bld->gallivm->builder;
   2689    const struct lp_type type = bld->type;
   2690    unsigned mantissa = lp_mantissa(type);
   2691    LLVMValueRef res;
   2692 
   2693    assert(type.floating);
   2694 
   2695    assert(lp_check_value(bld->type, x));
   2696 
   2697    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   2698 
   2699    res = LLVMBuildLShr(builder, x,
   2700                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   2701    res = LLVMBuildAnd(builder, res,
   2702                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
   2703    res = LLVMBuildSub(builder, res,
   2704                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
   2705 
   2706    return res;
   2707 }
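
        /*
         * E.g. 6.0 is stored as 1.5 * 2^2 with a biased exponent of 129, so
         * with bias == 0 this returns 129 - 127 = 2 == ifloor(log2(6.0)).
         */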
   2708 
   2709 
   2710 /**
   2711  * Extract the mantissa of an IEEE-754 floating point value.
   2712  *
   2713  * Result is a floating point value with
   2714  *
   2715  *   x / 2**floor(log2(x))
   2716  */
   2717 LLVMValueRef
   2718 lp_build_extract_mantissa(struct lp_build_context *bld,
   2719                           LLVMValueRef x)
   2720 {
   2721    LLVMBuilderRef builder = bld->gallivm->builder;
   2722    const struct lp_type type = bld->type;
   2723    unsigned mantissa = lp_mantissa(type);
   2724    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
   2725                                                   (1ULL << mantissa) - 1);
   2726    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   2727    LLVMValueRef res;
   2728 
   2729    assert(lp_check_value(bld->type, x));
   2730 
   2731    assert(type.floating);
   2732 
   2733    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   2734 
   2735    /* res = x / 2**ipart */
   2736    res = LLVMBuildAnd(builder, x, mantmask, "");
   2737    res = LLVMBuildOr(builder, res, one, "");
   2738    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   2739 
   2740    return res;
   2741 }
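
        /*
         * Continuing the example above: for x = 6.0 the mantissa bits encode
         * 1.5, and OR-ing in the exponent of 1.0 yields 1.5 = 6.0 / 2^2.
         */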
   2742 
   2743 
   2744 
   2745 /**
   2746  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
   2747  * These coefficients can be generated with
   2748  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   2749  */
   2750 const double lp_build_log2_polynomial[] = {
   2751 #if LOG_POLY_DEGREE == 5
   2752    2.88539008148777786488L,
   2753    0.961796878841293367824L,
   2754    0.577058946784739859012L,
   2755    0.412914355135828735411L,
   2756    0.308591899232910175289L,
   2757    0.352376952300281371868L,
   2758 #elif LOG_POLY_DEGREE == 4
   2759    2.88539009343309178325L,
   2760    0.961791550404184197881L,
   2761    0.577440339438736392009L,
   2762    0.403343858251329912514L,
   2763    0.406718052498846252698L,
   2764 #elif LOG_POLY_DEGREE == 3
   2765    2.88538959748872753838L,
   2766    0.961932915889597772928L,
   2767    0.571118517972136195241L,
   2768    0.493997535084709500285L,
   2769 #else
   2770 #error
   2771 #endif
   2772 };
   2773 
   2774 /**
   2775  * See http://www.devmaster.net/forums/showthread.php?p=43580
   2776  * http://en.wikipedia.org/wiki/Logarithm#Calculation
   2777  * http://www.nezumi.demon.co.uk/consult/logx.htm
   2778  */
   2779 void
   2780 lp_build_log2_approx(struct lp_build_context *bld,
   2781                      LLVMValueRef x,
   2782                      LLVMValueRef *p_exp,
   2783                      LLVMValueRef *p_floor_log2,
   2784                      LLVMValueRef *p_log2)
   2785 {
   2786    LLVMBuilderRef builder = bld->gallivm->builder;
   2787    const struct lp_type type = bld->type;
   2788    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   2789    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   2790 
   2791    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   2792    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   2793    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
   2794 
   2795    LLVMValueRef i = NULL;
   2796    LLVMValueRef y = NULL;
   2797    LLVMValueRef z = NULL;
   2798    LLVMValueRef exp = NULL;
   2799    LLVMValueRef mant = NULL;
   2800    LLVMValueRef logexp = NULL;
   2801    LLVMValueRef logmant = NULL;
   2802    LLVMValueRef res = NULL;
   2803 
   2804    assert(lp_check_value(bld->type, x));
   2805 
   2806    if(p_exp || p_floor_log2 || p_log2) {
   2807       /* TODO: optimize the constant case */
   2808       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
   2809           LLVMIsConstant(x)) {
   2810          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   2811                       __FUNCTION__);
   2812       }
   2813 
   2814       assert(type.floating && type.width == 32);
   2815 
   2816       /*
   2817        * We don't explicitly handle denormalized numbers.  They will yield a
   2818        * result in the neighbourhood of -127, which appears to be adequate in
   2819        * practice.
   2820        */
   2821 
   2822       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
   2823 
   2824       /* exp = (float) exponent(x) */
   2825       exp = LLVMBuildAnd(builder, i, expmask, "");
   2826    }
   2827 
   2828    if(p_floor_log2 || p_log2) {
   2829       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
   2830       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
   2831       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   2832    }
   2833 
   2834    if(p_log2) {
   2835       /* mant = 1 + (float) mantissa(x) */
   2836       mant = LLVMBuildAnd(builder, i, mantmask, "");
   2837       mant = LLVMBuildOr(builder, mant, one, "");
   2838       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
   2839 
   2840       /* y = (mant - 1) / (mant + 1) */
   2841       y = lp_build_div(bld,
   2842          lp_build_sub(bld, mant, bld->one),
   2843          lp_build_add(bld, mant, bld->one)
   2844       );
   2845 
   2846       /* z = y^2 */
   2847       z = lp_build_mul(bld, y, y);
   2848 
   2849       /* compute P(z) */
   2850       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
   2851                                     Elements(lp_build_log2_polynomial));
   2852 
   2853       /* logmant = y * P(z) */
   2854       logmant = lp_build_mul(bld, y, logmant);
   2855 
   2856       res = lp_build_add(bld, logmant, logexp);
   2857    }
   2858 
   2859    if(p_exp) {
   2860       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
   2861       *p_exp = exp;
   2862    }
   2863 
   2864    if(p_floor_log2)
   2865       *p_floor_log2 = logexp;
   2866 
   2867    if(p_log2)
   2868       *p_log2 = res;
   2869 }
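
        /*
         * The mantissa term relies on the atanh-based identity
         *
         *    log2(m) = log2((1 + y)/(1 - y))        with y = (m - 1)/(m + 1)
         *            = 2/ln(2) * (y + y^3/3 + y^5/5 + ...)
         *            = y * P(y^2)
         *
         * which is what lp_build_polynomial() evaluates above; the leading
         * coefficient of the minimax fit, 2.88539..., is just 2/ln(2).
         */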
   2870 
   2871 
   2872 LLVMValueRef
   2873 lp_build_log2(struct lp_build_context *bld,
   2874               LLVMValueRef x)
   2875 {
   2876    LLVMValueRef res;
   2877    lp_build_log2_approx(bld, x, NULL, NULL, &res);
   2878    return res;
   2879 }
   2880 
   2881 
   2882 /**
   2883  * Faster (and less accurate) log2.
   2884  *
   2885  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
   2886  *
   2887  * Piece-wise linear approximation, with exact results when x is a
   2888  * power of two.
   2889  *
   2890  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
   2891  */
   2892 LLVMValueRef
   2893 lp_build_fast_log2(struct lp_build_context *bld,
   2894                    LLVMValueRef x)
   2895 {
   2896    LLVMBuilderRef builder = bld->gallivm->builder;
   2897    LLVMValueRef ipart;
   2898    LLVMValueRef fpart;
   2899 
   2900    assert(lp_check_value(bld->type, x));
   2901 
   2902    assert(bld->type.floating);
   2903 
   2904    /* ipart = floor(log2(x)) - 1 */
   2905    ipart = lp_build_extract_exponent(bld, x, -1);
   2906    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
   2907 
   2908    /* fpart = x / 2**ipart */
   2909    fpart = lp_build_extract_mantissa(bld, x);
   2910 
   2911    /* ipart + fpart */
   2912    return LLVMBuildFAdd(builder, ipart, fpart, "");
   2913 }
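
        /*
         * E.g. x = 6.0 = 1.5 * 2^2 gives ipart = 2 - 1 = 1 and fpart = 1.5,
         * i.e. fast_log2(6.0) = 2.5 versus the exact log2(6.0) ~= 2.585.
         * For x = 8.0 = 1.0 * 2^3 it yields 2 + 1.0 = 3.0 exactly.
         */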
   2914 
   2915 
   2916 /**
   2917  * Fast implementation of iround(log2(x)).
   2918  *
   2919  * Not an approximation -- it should give accurate results all the time.
   2920  */
   2921 LLVMValueRef
   2922 lp_build_ilog2(struct lp_build_context *bld,
   2923                LLVMValueRef x)
   2924 {
   2925    LLVMBuilderRef builder = bld->gallivm->builder;
   2926    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   2927    LLVMValueRef ipart;
   2928 
   2929    assert(bld->type.floating);
   2930 
   2931    assert(lp_check_value(bld->type, x));
   2932 
   2933    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
   2934    x = LLVMBuildFMul(builder, x, sqrt2, "");
   2935 
   2936    /* ipart = floor(log2(x) + 0.5)  */
   2937    ipart = lp_build_extract_exponent(bld, x, 0);
   2938 
   2939    return ipart;
   2940 }
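
        /*
         * Multiplying by sqrt(2) adds exactly 0.5 to log2(x) before the floor
         * implied by the exponent extraction, turning floor() into round():
         * e.g. log2(5.0) ~= 2.32: 5.0 * sqrt(2) ~= 7.07, exponent 2;
         *      log2(6.0) ~= 2.58: 6.0 * sqrt(2) ~= 8.49, exponent 3.
         */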
   2941 
   2942 LLVMValueRef
   2943 lp_build_mod(struct lp_build_context *bld,
   2944              LLVMValueRef x,
   2945              LLVMValueRef y)
   2946 {
   2947    LLVMBuilderRef builder = bld->gallivm->builder;
   2948    LLVMValueRef res;
   2949    const struct lp_type type = bld->type;
   2950 
   2951    assert(lp_check_value(type, x));
   2952    assert(lp_check_value(type, y));
   2953 
   2954    if (type.floating)
   2955       res = LLVMBuildFRem(builder, x, y, "");
   2956    else if (type.sign)
   2957       res = LLVMBuildSRem(builder, x, y, "");
   2958    else
   2959       res = LLVMBuildURem(builder, x, y, "");
   2960    return res;
   2961 }
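
        /*
         * Note that LLVM's FRem/SRem compute a remainder with the sign of the
         * dividend, e.g. mod(-3, 2) == -1 rather than the 1 that a flooring
         * modulo would give.
         */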
   2962