/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given that we have all the necessary information readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when the source arguments are known to be in the
 *   [0, 1] range
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for the special-case values of a or b being 0 or 1 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating-point numbers. If one of the
       * inputs is NaN, the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand if either operand is
       * NaN by default, so we need special code to handle those cases.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
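   /* e.g. this yields "llvm.fmuladd.v4f32" for a <4 x float> type */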
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for the special-case values of a or b being 0 or 1 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
         intr_size = 128;
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

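   /*
    * For unsigned normalized types 1.0 is represented as all ones, so
    * 1.0 - a and ~a coincide: e.g. for 8-bit unorm, 0xff - 0x40 = 0xbf
    * = ~0x40, which is why the bitwise NOT below is valid.
    */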
   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
              if (type.width == 8)
                intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
              if (type.width == 16)
                intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_cpu_caps.has_altivec) {
              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
              if (type.width == 8)
                intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
              if (type.width == 16)
                intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

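   /*
    * Worked example of the signed clamping below, for 8-bit signed norm
    * (max_val = 0x7f): with a = 0x60 and b = 0x50, a_clamp_max =
    * min(0x60, 0x7f - 0x50) = 0x2f; since b > 0 we pick a = 0x2f, and
    * 0x2f + 0x50 = 0x7f saturates at the maximum instead of wrapping.
    */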
   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      } else {
         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * Callers should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * We use repeated shuffle/adds here. Note that with multiple vectors
    * this can be done more efficiently, as outlined in the Intel
    * optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */

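   /*
    * E.g. for a length-4 vector [a0 a1 a2 a3], the single loop iteration
    * splits it into [a0 a1] and [a2 a3] and adds them into [a0+a2 a1+a3];
    * the final two elements are then extracted and added below.
    */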
   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

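   /*
    * For inputs x, y, z, w (each a float4) this builds
    * tmp[0] = [x0 x1 y0 y1], tmp[1] = [x2 x3 y2 y3] (likewise z/w into
    * tmp[2]/tmp[3]), adds them pairwise, and then de-interleaves the two
    * partial sums so the final add yields [sum(x) sum(y) sum(z) sum(w)].
    */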
   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4;
 * only four adjacent values in each vector will be added, assuming
 * the values are really grouped into fours (which also determines
 * the output order).
 *
 * Returns a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_cpu_caps.has_altivec) {
              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
              if (type.width == 8)
                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
              if (type.width == 16)
                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
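         /*
          * E.g. with 8-bit unorm values a = 0x30 and b = 0x50, a becomes
          * max(a, b) = 0x50, so the subtraction below yields 0 instead of
          * wrapping around to 0xe0.
          */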
         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms, to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so either the special case b = 255 must be accounted
 *     for or roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */
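
   /*
    * Worked example with n = 8 and a = b = 255: ab = 65025;
    * ab + (ab >> 8) = 65025 + 254 = 65279; adding half (0x80) gives
    * 65407, and 65407 >> 8 = 255, so 1.0 * 1.0 yields exactly 1.0.
    */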

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

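   /*
    * For fixed-point types the double-width product is shifted back down
    * by width/2: e.g. with a 32-bit (16.16) fixed-point type, 0.5 * 0.5 =
    * 0x8000 * 0x8000 = 0x40000000, shifted right by 16 to give
    * 0x4000 = 0.25.
    */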
   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64 bit only.
 * The result is the low 32 bits; the high bits are returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq instructions plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    */
   if ((bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}


/*
 * Widening mul, valid for 32x32 bit -> 64 bit only.
 * The result is the low 32 bits; the high bits are returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, 32);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

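   /*
    * E.g. for unsigned inputs a = 0x80000000 and b = 4, the 64-bit
    * product is 0x200000000, so res_lo = 0 and *res_hi = 2.
    */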
   return res_lo;
}


/* a * b + c */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Optimized multiplication of a vector by a small integer constant.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

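   /*
    * E.g. b = 8 reduces to a left shift by 3 for integer types below,
    * while b = 5 falls through to a regular multiply by the constant
    * vector at the end.
    */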
   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster; it will introduce a small
          * error for multiplication by zero, and it will produce wrong
          * results for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
   1344 LLVMValueRef
   1345 lp_build_div(struct lp_build_context *bld,
   1346              LLVMValueRef a,
   1347              LLVMValueRef b)
   1348 {
   1349    LLVMBuilderRef builder = bld->gallivm->builder;
   1350    const struct lp_type type = bld->type;
   1351 
   1352    assert(lp_check_value(type, a));
   1353    assert(lp_check_value(type, b));
   1354 
   1355    if(a == bld->zero)
   1356       return bld->zero;
   1357    if(a == bld->one && type.floating)
   1358       return lp_build_rcp(bld, b);
   1359    if(b == bld->zero)
   1360       return bld->undef;
   1361    if(b == bld->one)
   1362       return a;
   1363    if(a == bld->undef || b == bld->undef)
   1364       return bld->undef;
   1365 
   1366    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
   1367       if (type.floating)
   1368          return LLVMConstFDiv(a, b);
   1369       else if (type.sign)
   1370          return LLVMConstSDiv(a, b);
   1371       else
   1372          return LLVMConstUDiv(a, b);
   1373    }
   1374 
   /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
   1376    if(FALSE &&
   1377       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   1378        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
   1379       type.floating)
   1380       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
   1381 
   1382    if (type.floating)
   1383       return LLVMBuildFDiv(builder, a, b, "");
   1384    else if (type.sign)
   1385       return LLVMBuildSDiv(builder, a, b, "");
   1386    else
   1387       return LLVMBuildUDiv(builder, a, b, "");
   1388 }
   1389 
   1390 
   1391 /**
   1392  * Linear interpolation helper.
   1393  *
 * @param flags  LP_BLD_LERP_* flags.  LP_BLD_LERP_WIDE_NORMALIZED indicates
 *        we are interpolating normalized values, encoded in normalized
 *        integers twice as wide as the nominal type.
   1396  *
   1397  * @sa http://www.stereopsis.com/doubleblend.html
   1398  */
   1399 static inline LLVMValueRef
   1400 lp_build_lerp_simple(struct lp_build_context *bld,
   1401                      LLVMValueRef x,
   1402                      LLVMValueRef v0,
   1403                      LLVMValueRef v1,
   1404                      unsigned flags)
   1405 {
   1406    unsigned half_width = bld->type.width/2;
   1407    LLVMBuilderRef builder = bld->gallivm->builder;
   1408    LLVMValueRef delta;
   1409    LLVMValueRef res;
   1410 
   1411    assert(lp_check_value(bld->type, x));
   1412    assert(lp_check_value(bld->type, v0));
   1413    assert(lp_check_value(bld->type, v1));
   1414 
   1415    delta = lp_build_sub(bld, v1, v0);
   1416 
   1417    if (bld->type.floating) {
   1418       assert(flags == 0);
   1419       return lp_build_mad(bld, x, delta, v0);
   1420    }
   1421 
   1422    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
   1423       if (!bld->type.sign) {
   1424          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
   1425             /*
   1426              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
   1428              * later we can just divide by 2**n instead of 2**n - 1.
   1429              */
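            /*
             * E.g. for n == 8, a weight of 0xff becomes 0xff + (0xff >> 7) ==
             * 0x100, so the shift by n below reproduces division by 255
             * exactly at the endpoints.
             */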
   1430 
   1431             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
   1432          }
   1433 
   1434          /* (x * delta) >> n */
   1435          res = lp_build_mul(bld, x, delta);
   1436          res = lp_build_shr_imm(bld, res, half_width);
   1437       } else {
   1438          /*
   1439           * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
   1441           * instead.
   1442           */
   1443          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
   1444          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
   1445       }
   1446    } else {
   1447       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
   1448       res = lp_build_mul(bld, x, delta);
   1449    }
   1450 
   1451    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
   1452       /*
   1453        * At this point both res and v0 only use the lower half of the bits,
   1454        * the rest is zero. Instead of add / mask, do add with half wide type.
   1455        */
   1456       struct lp_type narrow_type;
   1457       struct lp_build_context narrow_bld;
   1458 
   1459       memset(&narrow_type, 0, sizeof narrow_type);
   1460       narrow_type.sign   = bld->type.sign;
   1461       narrow_type.width  = bld->type.width/2;
   1462       narrow_type.length = bld->type.length*2;
   1463 
   1464       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
   1465       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
   1466       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
   1467       res = lp_build_add(&narrow_bld, v0, res);
   1468       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   1469    } else {
   1470       res = lp_build_add(bld, v0, res);
   1471 
   1472       if (bld->type.fixed) {
         /*
          * Mask out the high order bits when lerping 8-bit normalized
          * colors stored in 16 bits.
          *
          * XXX: This step is necessary for lerping 8-bit colors stored in
          * 16 bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
   1482          LLVMValueRef low_bits;
   1483          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
   1484          res = LLVMBuildAnd(builder, res, low_bits, "");
   1485       }
   1486    }
   1487 
   1488    return res;
   1489 }
   1490 
   1491 
   1492 /**
   1493  * Linear interpolation.
   1494  */
   1495 LLVMValueRef
   1496 lp_build_lerp(struct lp_build_context *bld,
   1497               LLVMValueRef x,
   1498               LLVMValueRef v0,
   1499               LLVMValueRef v1,
   1500               unsigned flags)
   1501 {
   1502    const struct lp_type type = bld->type;
   1503    LLVMValueRef res;
   1504 
   1505    assert(lp_check_value(type, x));
   1506    assert(lp_check_value(type, v0));
   1507    assert(lp_check_value(type, v1));
   1508 
   1509    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
   1510 
   1511    if (type.norm) {
   1512       struct lp_type wide_type;
   1513       struct lp_build_context wide_bld;
   1514       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
   1515 
   1516       assert(type.length >= 2);
   1517 
   1518       /*
   1519        * Create a wider integer type, enough to hold the
   1520        * intermediate result of the multiplication.
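       * E.g. a 16 x 8-bit normalized type becomes 8 x 16-bit.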
   1521        */
   1522       memset(&wide_type, 0, sizeof wide_type);
   1523       wide_type.sign   = type.sign;
   1524       wide_type.width  = type.width*2;
   1525       wide_type.length = type.length/2;
   1526 
   1527       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
   1528 
   1529       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
   1530       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
   1531       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
   1532 
   1533       /*
   1534        * Lerp both halves.
   1535        */
   1536 
   1537       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
   1538 
   1539       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
   1540       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
   1541 
   1542       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   1543    } else {
   1544       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   1545    }
   1546 
   1547    return res;
   1548 }
   1549 
   1550 
   1551 /**
   1552  * Bilinear interpolation.
   1553  *
 * Value indices are in v_{yx} order, i.e. v01 is the value at (x=1, y=0).
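 *
 * Computes lerp(y, lerp(x, v00, v01), lerp(x, v10, v11)).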
   1555  */
   1556 LLVMValueRef
   1557 lp_build_lerp_2d(struct lp_build_context *bld,
   1558                  LLVMValueRef x,
   1559                  LLVMValueRef y,
   1560                  LLVMValueRef v00,
   1561                  LLVMValueRef v01,
   1562                  LLVMValueRef v10,
   1563                  LLVMValueRef v11,
   1564                  unsigned flags)
   1565 {
   1566    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   1567    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   1568    return lp_build_lerp(bld, y, v0, v1, flags);
   1569 }
   1570 
   1571 
   1572 LLVMValueRef
   1573 lp_build_lerp_3d(struct lp_build_context *bld,
   1574                  LLVMValueRef x,
   1575                  LLVMValueRef y,
   1576                  LLVMValueRef z,
   1577                  LLVMValueRef v000,
   1578                  LLVMValueRef v001,
   1579                  LLVMValueRef v010,
   1580                  LLVMValueRef v011,
   1581                  LLVMValueRef v100,
   1582                  LLVMValueRef v101,
   1583                  LLVMValueRef v110,
   1584                  LLVMValueRef v111,
   1585                  unsigned flags)
   1586 {
   1587    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   1588    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   1589    return lp_build_lerp(bld, z, v0, v1, flags);
   1590 }
   1591 
   1592 
   1593 /**
   1594  * Generate min(a, b)
 * Does checks for special case values, but not for NaNs.
   1596  */
   1597 LLVMValueRef
   1598 lp_build_min(struct lp_build_context *bld,
   1599              LLVMValueRef a,
   1600              LLVMValueRef b)
   1601 {
   1602    assert(lp_check_value(bld->type, a));
   1603    assert(lp_check_value(bld->type, b));
   1604 
   1605    if(a == bld->undef || b == bld->undef)
   1606       return bld->undef;
   1607 
   1608    if(a == b)
   1609       return a;
   1610 
   1611    if (bld->type.norm) {
   1612       if (!bld->type.sign) {
   1613          if (a == bld->zero || b == bld->zero) {
   1614             return bld->zero;
   1615          }
   1616       }
   1617       if(a == bld->one)
   1618          return b;
   1619       if(b == bld->one)
   1620          return a;
   1621    }
   1622 
   1623    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
   1624 }
   1625 
   1626 
   1627 /**
   1628  * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
   1630  * nan_behavior argument.
   1631  */
   1632 LLVMValueRef
   1633 lp_build_min_ext(struct lp_build_context *bld,
   1634                  LLVMValueRef a,
   1635                  LLVMValueRef b,
   1636                  enum gallivm_nan_behavior nan_behavior)
   1637 {
   1638    assert(lp_check_value(bld->type, a));
   1639    assert(lp_check_value(bld->type, b));
   1640 
   1641    if(a == bld->undef || b == bld->undef)
   1642       return bld->undef;
   1643 
   1644    if(a == b)
   1645       return a;
   1646 
   1647    if (bld->type.norm) {
   1648       if (!bld->type.sign) {
   1649          if (a == bld->zero || b == bld->zero) {
   1650             return bld->zero;
   1651          }
   1652       }
   1653       if(a == bld->one)
   1654          return b;
   1655       if(b == bld->one)
   1656          return a;
   1657    }
   1658 
   1659    return lp_build_min_simple(bld, a, b, nan_behavior);
   1660 }
   1661 
   1662 /**
   1663  * Generate max(a, b)
 * Does checks for special case values, but NaN behavior is undefined.
   1665  */
   1666 LLVMValueRef
   1667 lp_build_max(struct lp_build_context *bld,
   1668              LLVMValueRef a,
   1669              LLVMValueRef b)
   1670 {
   1671    assert(lp_check_value(bld->type, a));
   1672    assert(lp_check_value(bld->type, b));
   1673 
   1674    if(a == bld->undef || b == bld->undef)
   1675       return bld->undef;
   1676 
   1677    if(a == b)
   1678       return a;
   1679 
   1680    if(bld->type.norm) {
   1681       if(a == bld->one || b == bld->one)
   1682          return bld->one;
   1683       if (!bld->type.sign) {
   1684          if (a == bld->zero) {
   1685             return b;
   1686          }
   1687          if (b == bld->zero) {
   1688             return a;
   1689          }
   1690       }
   1691    }
   1692 
   1693    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
   1694 }
   1695 
   1696 
   1697 /**
   1698  * Generate max(a, b)
   1699  * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
   1701  * nan_behavior argument.
   1702  */
   1703 LLVMValueRef
   1704 lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
   1708 {
   1709    assert(lp_check_value(bld->type, a));
   1710    assert(lp_check_value(bld->type, b));
   1711 
   1712    if(a == bld->undef || b == bld->undef)
   1713       return bld->undef;
   1714 
   1715    if(a == b)
   1716       return a;
   1717 
   1718    if(bld->type.norm) {
   1719       if(a == bld->one || b == bld->one)
   1720          return bld->one;
   1721       if (!bld->type.sign) {
   1722          if (a == bld->zero) {
   1723             return b;
   1724          }
   1725          if (b == bld->zero) {
   1726             return a;
   1727          }
   1728       }
   1729    }
   1730 
   1731    return lp_build_max_simple(bld, a, b, nan_behavior);
   1732 }
   1733 
   1734 /**
   1735  * Generate clamp(a, min, max)
   1736  * NaN behavior (for any of a, min, max) is undefined.
   1737  * Do checks for special cases.
   1738  */
   1739 LLVMValueRef
   1740 lp_build_clamp(struct lp_build_context *bld,
   1741                LLVMValueRef a,
   1742                LLVMValueRef min,
   1743                LLVMValueRef max)
   1744 {
   1745    assert(lp_check_value(bld->type, a));
   1746    assert(lp_check_value(bld->type, min));
   1747    assert(lp_check_value(bld->type, max));
   1748 
   1749    a = lp_build_min(bld, a, max);
   1750    a = lp_build_max(bld, a, min);
   1751    return a;
   1752 }
   1753 
   1754 
   1755 /**
   1756  * Generate clamp(a, 0, 1)
   1757  * A NaN will get converted to zero.
   1758  */
   1759 LLVMValueRef
   1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
   1761                                 LLVMValueRef a)
   1762 {
   1763    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   1764    a = lp_build_min(bld, a, bld->one);
   1765    return a;
   1766 }
   1767 
   1768 
   1769 /**
   1770  * Generate abs(a)
   1771  */
   1772 LLVMValueRef
   1773 lp_build_abs(struct lp_build_context *bld,
   1774              LLVMValueRef a)
   1775 {
   1776    LLVMBuilderRef builder = bld->gallivm->builder;
   1777    const struct lp_type type = bld->type;
   1778    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1779 
   1780    assert(lp_check_value(type, a));
   1781 
   1782    if(!type.sign)
   1783       return a;
   1784 
   1785    if(type.floating) {
   1786       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
   1787          /* Workaround llvm.org/PR27332 */
   1788          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   1789          unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, absMask);
   1791          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1792          a = LLVMBuildAnd(builder, a, mask, "");
   1793          a = LLVMBuildBitCast(builder, a, vec_type, "");
   1794          return a;
   1795       } else {
   1796          char intrinsic[32];
   1797          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
   1798          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   1799       }
   1800    }
   1801 
   1802    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
   1803       switch(type.width) {
   1804       case 8:
   1805          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
   1806       case 16:
   1807          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
   1808       case 32:
   1809          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
   1810       }
   1811    }
   1812    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
   1813       switch(type.width) {
   1814       case 8:
   1815          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
   1816       case 16:
   1817          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
   1818       case 32:
   1819          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
   1820       }
   1821    }
   1822 
   1823    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
   1824                           a, LLVMBuildNeg(builder, a, ""));
   1825 }
   1826 
   1827 
   1828 LLVMValueRef
   1829 lp_build_negate(struct lp_build_context *bld,
   1830                 LLVMValueRef a)
   1831 {
   1832    LLVMBuilderRef builder = bld->gallivm->builder;
   1833 
   1834    assert(lp_check_value(bld->type, a));
   1835 
   1836    if (bld->type.floating)
   1837       a = LLVMBuildFNeg(builder, a, "");
   1838    else
   1839       a = LLVMBuildNeg(builder, a, "");
   1840 
   1841    return a;
   1842 }
   1843 
   1844 
   1845 /** Return -1, 0 or +1 depending on the sign of a */
   1846 LLVMValueRef
   1847 lp_build_sgn(struct lp_build_context *bld,
   1848              LLVMValueRef a)
   1849 {
   1850    LLVMBuilderRef builder = bld->gallivm->builder;
   1851    const struct lp_type type = bld->type;
   1852    LLVMValueRef cond;
   1853    LLVMValueRef res;
   1854 
   1855    assert(lp_check_value(type, a));
   1856 
   1857    /* Handle non-zero case */
   1858    if(!type.sign) {
   1859       /* if not zero then sign must be positive */
   1860       res = bld->one;
   1861    }
   1862    else if(type.floating) {
   1863       LLVMTypeRef vec_type;
   1864       LLVMTypeRef int_type;
   1865       LLVMValueRef mask;
   1866       LLVMValueRef sign;
   1867       LLVMValueRef one;
   1868       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
   1869 
   1870       int_type = lp_build_int_vec_type(bld->gallivm, type);
   1871       vec_type = lp_build_vec_type(bld->gallivm, type);
   1872       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
   1873 
      /* OR the sign bit of 'a' into the constant +1.0 */
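      /*
       * E.g. for 32-bit floats, 0x80000000 | 0x3f800000 (+1.0f) gives
       * 0xbf800000 (-1.0f), while a clear sign bit leaves +1.0f intact.
       */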
   1875       sign = LLVMBuildBitCast(builder, a, int_type, "");
   1876       sign = LLVMBuildAnd(builder, sign, mask, "");
   1877       one = LLVMConstBitCast(bld->one, int_type);
   1878       res = LLVMBuildOr(builder, sign, one, "");
   1879       res = LLVMBuildBitCast(builder, res, vec_type, "");
   1880    }
   1881    else
   1882    {
   1883       /* signed int/norm/fixed point */
   1884       /* could use psign with sse3 and appropriate vectors here */
   1885       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
   1886       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
   1887       res = lp_build_select(bld, cond, bld->one, minus_one);
   1888    }
   1889 
   1890    /* Handle zero */
   1891    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   1892    res = lp_build_select(bld, cond, bld->zero, res);
   1893 
   1894    return res;
   1895 }
   1896 
   1897 
   1898 /**
   1899  * Set the sign of float vector 'a' according to 'sign'.
   1900  * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
   1902  * Other values for sign produce undefined results.
   1903  */
   1904 LLVMValueRef
   1905 lp_build_set_sign(struct lp_build_context *bld,
   1906                   LLVMValueRef a, LLVMValueRef sign)
   1907 {
   1908    LLVMBuilderRef builder = bld->gallivm->builder;
   1909    const struct lp_type type = bld->type;
   1910    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   1911    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1912    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   1913    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   1914                              ~((unsigned long long) 1 << (type.width - 1)));
   1915    LLVMValueRef val, res;
   1916 
   1917    assert(type.floating);
   1918    assert(lp_check_value(type, a));
   1919 
   1920    /* val = reinterpret_cast<int>(a) */
   1921    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   1922    /* val = val & mask */
   1923    val = LLVMBuildAnd(builder, val, mask, "");
   1924    /* sign = sign << shift */
   1925    sign = LLVMBuildShl(builder, sign, shift, "");
   1926    /* res = val | sign */
   1927    res = LLVMBuildOr(builder, val, sign, "");
   1928    /* res = reinterpret_cast<float>(res) */
   1929    res = LLVMBuildBitCast(builder, res, vec_type, "");
   1930 
   1931    return res;
   1932 }
   1933 
   1934 
   1935 /**
   1936  * Convert vector of (or scalar) int to vector of (or scalar) float.
   1937  */
   1938 LLVMValueRef
   1939 lp_build_int_to_float(struct lp_build_context *bld,
   1940                       LLVMValueRef a)
   1941 {
   1942    LLVMBuilderRef builder = bld->gallivm->builder;
   1943    const struct lp_type type = bld->type;
   1944    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   1945 
   1946    assert(type.floating);
   1947 
   1948    return LLVMBuildSIToFP(builder, a, vec_type, "");
   1949 }
   1950 
   1951 static boolean
   1952 arch_rounding_available(const struct lp_type type)
   1953 {
   1954    if ((util_cpu_caps.has_sse4_1 &&
   1955        (type.length == 1 || type.width*type.length == 128)) ||
   1956        (util_cpu_caps.has_avx && type.width*type.length == 256) ||
   1957        (util_cpu_caps.has_avx512f && type.width*type.length == 512))
   1958       return TRUE;
   1959    else if ((util_cpu_caps.has_altivec &&
   1960             (type.width == 32 && type.length == 4)))
   1961       return TRUE;
   1962 
   1963    return FALSE;
   1964 }
   1965 
   1966 enum lp_build_round_mode
   1967 {
   1968    LP_BUILD_ROUND_NEAREST = 0,
   1969    LP_BUILD_ROUND_FLOOR = 1,
   1970    LP_BUILD_ROUND_CEIL = 2,
   1971    LP_BUILD_ROUND_TRUNCATE = 3
   1972 };
   1973 
   1974 static inline LLVMValueRef
   1975 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
   1976                              LLVMValueRef a)
   1977 {
   1978    LLVMBuilderRef builder = bld->gallivm->builder;
   1979    const struct lp_type type = bld->type;
   1980    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   1981    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   1982    const char *intrinsic;
   1983    LLVMValueRef res;
   1984 
   1985    assert(type.floating);
   1986    /* using the double precision conversions is a bit more complicated */
   1987    assert(type.width == 32);
   1988 
   1989    assert(lp_check_value(type, a));
   1990    assert(util_cpu_caps.has_sse2);
   1991 
   1992    /* This is relying on MXCSR rounding mode, which should always be nearest. */
   1993    if (type.length == 1) {
   1994       LLVMTypeRef vec_type;
   1995       LLVMValueRef undef;
   1996       LLVMValueRef arg;
   1997       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   1998 
   1999       vec_type = LLVMVectorType(bld->elem_type, 4);
   2000 
   2001       intrinsic = "llvm.x86.sse.cvtss2si";
   2002 
   2003       undef = LLVMGetUndef(vec_type);
   2004 
   2005       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
   2006 
   2007       res = lp_build_intrinsic_unary(builder, intrinsic,
   2008                                      ret_type, arg);
   2009    }
   2010    else {
      if (type.width * type.length == 128) {
   2012          intrinsic = "llvm.x86.sse2.cvtps2dq";
   2013       }
   2014       else {
   2015          assert(type.width*type.length == 256);
   2016          assert(util_cpu_caps.has_avx);
   2017 
   2018          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
   2019       }
   2020       res = lp_build_intrinsic_unary(builder, intrinsic,
   2021                                      ret_type, a);
   2022    }
   2023 
   2024    return res;
   2025 }
   2026 
   2027 
/*
 * Round 'a' according to 'mode', using the AltiVec vrfi* rounding intrinsics.
 */
   2030 static inline LLVMValueRef
   2031 lp_build_round_altivec(struct lp_build_context *bld,
   2032                        LLVMValueRef a,
   2033                        enum lp_build_round_mode mode)
   2034 {
   2035    LLVMBuilderRef builder = bld->gallivm->builder;
   2036    const struct lp_type type = bld->type;
   2037    const char *intrinsic = NULL;
   2038 
   2039    assert(type.floating);
   2040 
   2041    assert(lp_check_value(type, a));
   2042    assert(util_cpu_caps.has_altivec);
   2043 
   2044    (void)type;
   2045 
   2046    switch (mode) {
   2047    case LP_BUILD_ROUND_NEAREST:
   2048       intrinsic = "llvm.ppc.altivec.vrfin";
   2049       break;
   2050    case LP_BUILD_ROUND_FLOOR:
   2051       intrinsic = "llvm.ppc.altivec.vrfim";
   2052       break;
   2053    case LP_BUILD_ROUND_CEIL:
   2054       intrinsic = "llvm.ppc.altivec.vrfip";
   2055       break;
   2056    case LP_BUILD_ROUND_TRUNCATE:
   2057       intrinsic = "llvm.ppc.altivec.vrfiz";
   2058       break;
   2059    }
   2060 
   2061    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2062 }
   2063 
   2064 static inline LLVMValueRef
   2065 lp_build_round_arch(struct lp_build_context *bld,
   2066                     LLVMValueRef a,
   2067                     enum lp_build_round_mode mode)
   2068 {
   2069    if (util_cpu_caps.has_sse4_1) {
   2070       LLVMBuilderRef builder = bld->gallivm->builder;
   2071       const struct lp_type type = bld->type;
   2072       const char *intrinsic_root;
   2073       char intrinsic[32];
   2074 
   2075       assert(type.floating);
   2076       assert(lp_check_value(type, a));
   2077       (void)type;
   2078 
   2079       switch (mode) {
   2080       case LP_BUILD_ROUND_NEAREST:
   2081          intrinsic_root = "llvm.nearbyint";
   2082          break;
   2083       case LP_BUILD_ROUND_FLOOR:
   2084          intrinsic_root = "llvm.floor";
   2085          break;
   2086       case LP_BUILD_ROUND_CEIL:
   2087          intrinsic_root = "llvm.ceil";
   2088          break;
   2089       case LP_BUILD_ROUND_TRUNCATE:
   2090          intrinsic_root = "llvm.trunc";
   2091          break;
   2092       }
   2093 
   2094       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
   2095       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2096    }
   2097    else /* (util_cpu_caps.has_altivec) */
   2098      return lp_build_round_altivec(bld, a, mode);
   2099 }
   2100 
   2101 /**
   2102  * Return the integer part of a float (vector) value (== round toward zero).
   2103  * The returned value is a float (vector).
   2104  * Ex: trunc(-1.5) = -1.0
   2105  */
   2106 LLVMValueRef
   2107 lp_build_trunc(struct lp_build_context *bld,
   2108                LLVMValueRef a)
   2109 {
   2110    LLVMBuilderRef builder = bld->gallivm->builder;
   2111    const struct lp_type type = bld->type;
   2112 
   2113    assert(type.floating);
   2114    assert(lp_check_value(type, a));
   2115 
   2116    if (arch_rounding_available(type)) {
   2117       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   2118    }
   2119    else {
   2120       const struct lp_type type = bld->type;
   2121       struct lp_type inttype;
   2122       struct lp_build_context intbld;
   2123       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2124       LLVMValueRef trunc, res, anosign, mask;
   2125       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2126       LLVMTypeRef vec_type = bld->vec_type;
   2127 
   2128       assert(type.width == 32); /* might want to handle doubles at some point */
   2129 
   2130       inttype = type;
   2131       inttype.floating = 0;
   2132       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2133 
   2134       /* round by truncation */
   2135       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2136       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
   2137 
   2138       /* mask out sign bit */
   2139       anosign = lp_build_abs(bld, a);
   2140       /*
   2141        * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact - a 24-bit mantissa has
       * no room for a fractional part at that magnitude) as well as special
       * cases like NaNs, Infs (taking advantage of the fact they use the
       * max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
   2146        */
   2147       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2148       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2149       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2150       return lp_build_select(bld, mask, a, res);
   2151    }
   2152 }
   2153 
   2154 
   2155 /**
   2156  * Return float (vector) rounded to nearest integer (vector).  The returned
   2157  * value is a float (vector).
   2158  * Ex: round(0.9) = 1.0
   2159  * Ex: round(-1.5) = -2.0
   2160  */
   2161 LLVMValueRef
   2162 lp_build_round(struct lp_build_context *bld,
   2163                LLVMValueRef a)
   2164 {
   2165    LLVMBuilderRef builder = bld->gallivm->builder;
   2166    const struct lp_type type = bld->type;
   2167 
   2168    assert(type.floating);
   2169    assert(lp_check_value(type, a));
   2170 
   2171    if (arch_rounding_available(type)) {
   2172       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   2173    }
   2174    else {
   2175       const struct lp_type type = bld->type;
   2176       struct lp_type inttype;
   2177       struct lp_build_context intbld;
   2178       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2179       LLVMValueRef res, anosign, mask;
   2180       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2181       LLVMTypeRef vec_type = bld->vec_type;
   2182 
   2183       assert(type.width == 32); /* might want to handle doubles at some point */
   2184 
   2185       inttype = type;
   2186       inttype.floating = 0;
   2187       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2188 
   2189       res = lp_build_iround(bld, a);
   2190       res = LLVMBuildSIToFP(builder, res, vec_type, "");
   2191 
   2192       /* mask out sign bit */
   2193       anosign = lp_build_abs(bld, a);
   2194       /*
   2195        * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special
       * cases like NaNs, Infs (taking advantage of the fact they use the
       * max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
   2200        */
   2201       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2202       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2203       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2204       return lp_build_select(bld, mask, a, res);
   2205    }
   2206 }
   2207 
   2208 
   2209 /**
   2210  * Return floor of float (vector), result is a float (vector)
   2211  * Ex: floor(1.1) = 1.0
   2212  * Ex: floor(-1.1) = -2.0
   2213  */
   2214 LLVMValueRef
   2215 lp_build_floor(struct lp_build_context *bld,
   2216                LLVMValueRef a)
   2217 {
   2218    LLVMBuilderRef builder = bld->gallivm->builder;
   2219    const struct lp_type type = bld->type;
   2220 
   2221    assert(type.floating);
   2222    assert(lp_check_value(type, a));
   2223 
   2224    if (arch_rounding_available(type)) {
   2225       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   2226    }
   2227    else {
   2228       const struct lp_type type = bld->type;
   2229       struct lp_type inttype;
   2230       struct lp_build_context intbld;
   2231       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2232       LLVMValueRef trunc, res, anosign, mask;
   2233       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2234       LLVMTypeRef vec_type = bld->vec_type;
   2235 
   2236       if (type.width != 32) {
   2237          char intrinsic[32];
   2238          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
   2239          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2240       }
   2241 
   2242       assert(type.width == 32); /* might want to handle doubles at some point */
   2243 
   2244       inttype = type;
   2245       inttype.floating = 0;
   2246       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2247 
   2248       /* round by truncation */
   2249       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2250       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
   2251 
   2252       if (type.sign) {
   2253          LLVMValueRef tmp;
   2254 
   2255          /*
   2256           * fix values if rounding is wrong (for non-special cases)
   2257           * - this is the case if trunc > a
   2258           */
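         /* e.g. a == -1.1: trunc gives -1.0 > a, so 1.0 is subtracted -> -2.0 */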
   2259          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
   2260          /* tmp = trunc > a ? 1.0 : 0.0 */
   2261          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
   2262          tmp = lp_build_and(&intbld, mask, tmp);
   2263          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
   2264          res = lp_build_sub(bld, res, tmp);
   2265       }
   2266 
   2267       /* mask out sign bit */
   2268       anosign = lp_build_abs(bld, a);
   2269       /*
   2270        * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special
       * cases like NaNs, Infs (taking advantage of the fact they use the
       * max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
   2275        */
   2276       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2277       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2278       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2279       return lp_build_select(bld, mask, a, res);
   2280    }
   2281 }
   2282 
   2283 
   2284 /**
   2285  * Return ceiling of float (vector), returning float (vector).
   2286  * Ex: ceil( 1.1) = 2.0
   2287  * Ex: ceil(-1.1) = -1.0
   2288  */
   2289 LLVMValueRef
   2290 lp_build_ceil(struct lp_build_context *bld,
   2291               LLVMValueRef a)
   2292 {
   2293    LLVMBuilderRef builder = bld->gallivm->builder;
   2294    const struct lp_type type = bld->type;
   2295 
   2296    assert(type.floating);
   2297    assert(lp_check_value(type, a));
   2298 
   2299    if (arch_rounding_available(type)) {
   2300       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   2301    }
   2302    else {
   2303       const struct lp_type type = bld->type;
   2304       struct lp_type inttype;
   2305       struct lp_build_context intbld;
   2306       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
   2307       LLVMValueRef trunc, res, anosign, mask, tmp;
   2308       LLVMTypeRef int_vec_type = bld->int_vec_type;
   2309       LLVMTypeRef vec_type = bld->vec_type;
   2310 
   2311       if (type.width != 32) {
   2312          char intrinsic[32];
   2313          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
   2314          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2315       }
   2316 
   2317       assert(type.width == 32); /* might want to handle doubles at some point */
   2318 
   2319       inttype = type;
   2320       inttype.floating = 0;
   2321       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2322 
   2323       /* round by truncation */
   2324       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2325       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
   2326 
   2327       /*
   2328        * fix values if rounding is wrong (for non-special cases)
   2329        * - this is the case if trunc < a
   2330        */
   2331       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
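      /* e.g. a == 1.1: trunc gives 1.0 < a, so 1.0 is added -> 2.0 */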
   2332       /* tmp = trunc < a ? 1.0 : 0.0 */
   2333       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
   2334       tmp = lp_build_and(&intbld, mask, tmp);
   2335       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
   2336       res = lp_build_add(bld, trunc, tmp);
   2337 
   2338       /* mask out sign bit */
   2339       anosign = lp_build_abs(bld, a);
   2340       /*
   2341        * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special
       * cases like NaNs, Infs (taking advantage of the fact they use the
       * max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
   2346        */
   2347       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
   2348       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
   2349       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
   2350       return lp_build_select(bld, mask, a, res);
   2351    }
   2352 }
   2353 
   2354 
   2355 /**
   2356  * Return fractional part of 'a' computed as a - floor(a)
   2357  * Typically used in texture coord arithmetic.
   2358  */
   2359 LLVMValueRef
   2360 lp_build_fract(struct lp_build_context *bld,
   2361                LLVMValueRef a)
   2362 {
   2363    assert(bld->type.floating);
   2364    return lp_build_sub(bld, a, lp_build_floor(bld, a));
   2365 }
   2366 
   2367 
   2368 /**
   2369  * Prevent returning 1.0 for very small negative values of 'a' by clamping
   2370  * against 0.99999(9). (Will also return that value for NaNs.)
   2371  */
   2372 static inline LLVMValueRef
   2373 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
   2374 {
   2375    LLVMValueRef max;
   2376 
   2377    /* this is the largest number smaller than 1.0 representable as float */
   2378    max = lp_build_const_vec(bld->gallivm, bld->type,
   2379                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
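   /* (For 32-bit floats this is 1.0 - 2^-24, i.e. bit pattern 0x3f7fffff.) */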
   2380    return lp_build_min_ext(bld, fract, max,
   2381                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   2382 }
   2383 
   2384 
   2385 /**
   2386  * Same as lp_build_fract, but guarantees that the result is always smaller
   2387  * than one. Will also return the smaller-than-one value for infs, NaNs.
   2388  */
   2389 LLVMValueRef
   2390 lp_build_fract_safe(struct lp_build_context *bld,
   2391                     LLVMValueRef a)
   2392 {
   2393    return clamp_fract(bld, lp_build_fract(bld, a));
   2394 }
   2395 
   2396 
   2397 /**
   2398  * Return the integer part of a float (vector) value (== round toward zero).
   2399  * The returned value is an integer (vector).
   2400  * Ex: itrunc(-1.5) = -1
   2401  */
   2402 LLVMValueRef
   2403 lp_build_itrunc(struct lp_build_context *bld,
   2404                 LLVMValueRef a)
   2405 {
   2406    LLVMBuilderRef builder = bld->gallivm->builder;
   2407    const struct lp_type type = bld->type;
   2408    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   2409 
   2410    assert(type.floating);
   2411    assert(lp_check_value(type, a));
   2412 
   2413    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2414 }
   2415 
   2416 
   2417 /**
   2418  * Return float (vector) rounded to nearest integer (vector).  The returned
   2419  * value is an integer (vector).
   2420  * Ex: iround(0.9) = 1
   2421  * Ex: iround(-1.5) = -2
   2422  */
   2423 LLVMValueRef
   2424 lp_build_iround(struct lp_build_context *bld,
   2425                 LLVMValueRef a)
   2426 {
   2427    LLVMBuilderRef builder = bld->gallivm->builder;
   2428    const struct lp_type type = bld->type;
   2429    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2430    LLVMValueRef res;
   2431 
   2432    assert(type.floating);
   2433 
   2434    assert(lp_check_value(type, a));
   2435 
   2436    if ((util_cpu_caps.has_sse2 &&
   2437        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
   2438        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   2439       return lp_build_iround_nearest_sse2(bld, a);
   2440    }
   2441    if (arch_rounding_available(type)) {
   2442       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   2443    }
   2444    else {
   2445       LLVMValueRef half;
   2446 
   2447       half = lp_build_const_vec(bld->gallivm, type, 0.5);
   2448 
   2449       if (type.sign) {
   2450          LLVMTypeRef vec_type = bld->vec_type;
   2451          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
   2452                                     (unsigned long long)1 << (type.width - 1));
   2453          LLVMValueRef sign;
   2454 
   2455          /* get sign bit */
   2456          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
   2457          sign = LLVMBuildAnd(builder, sign, mask, "");
   2458 
   2459          /* sign * 0.5 */
   2460          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
   2461          half = LLVMBuildOr(builder, sign, half, "");
   2462          half = LLVMBuildBitCast(builder, half, vec_type, "");
   2463       }
   2464 
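      /*
       * Adding the sign-copied 0.5 and truncating in the FPToSI below
       * implements round-half-away-from-zero, e.g. 1.5 -> 2 and -1.5 -> -2.
       */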
   2465       res = LLVMBuildFAdd(builder, a, half, "");
   2466    }
   2467 
   2468    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   2469 
   2470    return res;
   2471 }
   2472 
   2473 
   2474 /**
   2475  * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
   2478  */
   2479 LLVMValueRef
   2480 lp_build_ifloor(struct lp_build_context *bld,
   2481                 LLVMValueRef a)
   2482 {
   2483    LLVMBuilderRef builder = bld->gallivm->builder;
   2484    const struct lp_type type = bld->type;
   2485    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2486    LLVMValueRef res;
   2487 
   2488    assert(type.floating);
   2489    assert(lp_check_value(type, a));
   2490 
   2491    res = a;
   2492    if (type.sign) {
   2493       if (arch_rounding_available(type)) {
   2494          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   2495       }
   2496       else {
   2497          struct lp_type inttype;
   2498          struct lp_build_context intbld;
   2499          LLVMValueRef trunc, itrunc, mask;
   2500 
   2501          assert(type.floating);
   2502          assert(lp_check_value(type, a));
   2503 
   2504          inttype = type;
   2505          inttype.floating = 0;
   2506          lp_build_context_init(&intbld, bld->gallivm, inttype);
   2507 
   2508          /* round by truncation */
   2509          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2510          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
   2511 
   2512          /*
   2513           * fix values if rounding is wrong (for non-special cases)
   2514           * - this is the case if trunc > a
   2515           * The results of doing this with NaNs, very large values etc.
   2516           * are undefined but this seems to be the case anyway.
   2517           */
   2518          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
   2519          /* cheapie minus one with mask since the mask is minus one / zero */
   2520          return lp_build_add(&intbld, itrunc, mask);
   2521       }
   2522    }
   2523 
   /* truncate toward zero (== floor for the unsigned case, exact otherwise) */
   2525    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
   2526 
   2527    return res;
   2528 }
   2529 
   2530 
   2531 /**
   2532  * Return ceiling of float (vector), returning int (vector).
   2533  * Ex: iceil( 1.1) = 2
   2534  * Ex: iceil(-1.1) = -1
   2535  */
   2536 LLVMValueRef
   2537 lp_build_iceil(struct lp_build_context *bld,
   2538                LLVMValueRef a)
   2539 {
   2540    LLVMBuilderRef builder = bld->gallivm->builder;
   2541    const struct lp_type type = bld->type;
   2542    LLVMTypeRef int_vec_type = bld->int_vec_type;
   2543    LLVMValueRef res;
   2544 
   2545    assert(type.floating);
   2546    assert(lp_check_value(type, a));
   2547 
   2548    if (arch_rounding_available(type)) {
   2549       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   2550    }
   2551    else {
   2552       struct lp_type inttype;
   2553       struct lp_build_context intbld;
   2554       LLVMValueRef trunc, itrunc, mask;
   2555 
   2556       assert(type.floating);
   2557       assert(lp_check_value(type, a));
   2558 
   2559       inttype = type;
   2560       inttype.floating = 0;
   2561       lp_build_context_init(&intbld, bld->gallivm, inttype);
   2562 
   2563       /* round by truncation */
   2564       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   2565       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
   2566 
   2567       /*
   2568        * fix values if rounding is wrong (for non-special cases)
   2569        * - this is the case if trunc < a
   2570        * The results of doing this with NaNs, very large values etc.
   2571        * are undefined but this seems to be the case anyway.
   2572        */
   2573       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
   2574       /* cheapie plus one with mask since the mask is minus one / zero */
   2575       return lp_build_sub(&intbld, itrunc, mask);
   2576    }
   2577 
   /* convert to integer (res was already rounded to a whole number above) */
   2579    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
   2580 
   2581    return res;
   2582 }
   2583 
   2584 
   2585 /**
   2586  * Combined ifloor() & fract().
   2587  *
   2588  * Preferred to calling the functions separately, as it will ensure that the
   2589  * strategy (floor() vs ifloor()) that results in less redundant work is used.
   2590  */
   2591 void
   2592 lp_build_ifloor_fract(struct lp_build_context *bld,
   2593                       LLVMValueRef a,
   2594                       LLVMValueRef *out_ipart,
   2595                       LLVMValueRef *out_fpart)
   2596 {
   2597    LLVMBuilderRef builder = bld->gallivm->builder;
   2598    const struct lp_type type = bld->type;
   2599    LLVMValueRef ipart;
   2600 
   2601    assert(type.floating);
   2602    assert(lp_check_value(type, a));
   2603 
   2604    if (arch_rounding_available(type)) {
   2605       /*
   2606        * floor() is easier.
   2607        */
   2608 
   2609       ipart = lp_build_floor(bld, a);
   2610       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   2611       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   2612    }
   2613    else {
   2614       /*
   2615        * ifloor() is easier.
   2616        */
   2617 
   2618       *out_ipart = lp_build_ifloor(bld, a);
   2619       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
   2620       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   2621    }
   2622 }
   2623 
   2624 
   2625 /**
   2626  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
   2627  * always smaller than one.
   2628  */
   2629 void
   2630 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
   2631                            LLVMValueRef a,
   2632                            LLVMValueRef *out_ipart,
   2633                            LLVMValueRef *out_fpart)
   2634 {
   2635    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   2636    *out_fpart = clamp_fract(bld, *out_fpart);
   2637 }
   2638 
   2639 
   2640 LLVMValueRef
   2641 lp_build_sqrt(struct lp_build_context *bld,
   2642               LLVMValueRef a)
   2643 {
   2644    LLVMBuilderRef builder = bld->gallivm->builder;
   2645    const struct lp_type type = bld->type;
   2646    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   2647    char intrinsic[32];
   2648 
   2649    assert(lp_check_value(type, a));
   2650 
   2651    assert(type.floating);
   2652    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
   2653 
   2654    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
   2655 }
   2656 
   2657 
   2658 /**
 * Do one Newton-Raphson step to improve reciprocal precision:
   2660  *
   2661  *   x_{i+1} = x_i * (2 - a * x_i)
   2662  *
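 * (This is the Newton-Raphson update x - f(x)/f'(x) for f(x) = 1/x - a;
 * each step roughly doubles the number of correct mantissa bits.)
 *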
   2663  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
   2664  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
   2666  * halo. It would be necessary to clamp the argument to prevent this.
   2667  *
   2668  * See also:
   2669  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
   2670  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
   2671  */
   2672 static inline LLVMValueRef
   2673 lp_build_rcp_refine(struct lp_build_context *bld,
   2674                     LLVMValueRef a,
   2675                     LLVMValueRef rcp_a)
   2676 {
   2677    LLVMBuilderRef builder = bld->gallivm->builder;
   2678    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   2679    LLVMValueRef res;
   2680 
   2681    res = LLVMBuildFMul(builder, a, rcp_a, "");
   2682    res = LLVMBuildFSub(builder, two, res, "");
   2683    res = LLVMBuildFMul(builder, rcp_a, res, "");
   2684 
   2685    return res;
   2686 }
   2687 
   2688 
   2689 LLVMValueRef
   2690 lp_build_rcp(struct lp_build_context *bld,
   2691              LLVMValueRef a)
   2692 {
   2693    LLVMBuilderRef builder = bld->gallivm->builder;
   2694    const struct lp_type type = bld->type;
   2695 
   2696    assert(lp_check_value(type, a));
   2697 
   2698    if(a == bld->zero)
   2699       return bld->undef;
   2700    if(a == bld->one)
   2701       return bld->one;
   2702    if(a == bld->undef)
   2703       return bld->undef;
   2704 
   2705    assert(type.floating);
   2706 
   2707    if(LLVMIsConstant(a))
   2708       return LLVMConstFDiv(bld->one, a);
   2709 
   2710    /*
   2711     * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
   2721     */
   2722 
   2723    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
   2725       const unsigned num_iterations = 0;
   2726       LLVMValueRef res;
   2727       unsigned i;
   2728       const char *intrinsic = NULL;
   2729 
   2730       if (type.length == 4) {
   2731          intrinsic = "llvm.x86.sse.rcp.ps";
   2732       }
   2733       else {
   2734          intrinsic = "llvm.x86.avx.rcp.ps.256";
   2735       }
   2736 
   2737       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2738 
   2739       for (i = 0; i < num_iterations; ++i) {
   2740          res = lp_build_rcp_refine(bld, a, res);
   2741       }
   2742 
   2743       return res;
   2744    }
   2745 
   2746    return LLVMBuildFDiv(builder, bld->one, a, "");
   2747 }
   2748 
   2749 
   2750 /**
   2751  * Do one Newton-Raphson step to improve rsqrt precision:
   2752  *
   2753  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
   2754  *
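 * (This is the Newton-Raphson update x - f(x)/f'(x) for f(x) = 1/x^2 - a;
 * each step roughly doubles the number of correct mantissa bits.)
 *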
   2755  * See also Intel 64 and IA-32 Architectures Optimization Manual.
   2756  */
   2757 static inline LLVMValueRef
   2758 lp_build_rsqrt_refine(struct lp_build_context *bld,
   2759                       LLVMValueRef a,
   2760                       LLVMValueRef rsqrt_a)
   2761 {
   2762    LLVMBuilderRef builder = bld->gallivm->builder;
   2763    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   2764    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   2765    LLVMValueRef res;
   2766 
   2767    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   2768    res = LLVMBuildFMul(builder, a, res, "");
   2769    res = LLVMBuildFSub(builder, three, res, "");
   2770    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   2771    res = LLVMBuildFMul(builder, half, res, "");
   2772 
   2773    return res;
   2774 }
   2775 
   2776 
   2777 /**
   2778  * Generate 1/sqrt(a).
   2779  * Result is undefined for values < 0, infinity for +0.
   2780  */
   2781 LLVMValueRef
   2782 lp_build_rsqrt(struct lp_build_context *bld,
   2783                LLVMValueRef a)
   2784 {
   2785    const struct lp_type type = bld->type;
   2786 
   2787    assert(lp_check_value(type, a));
   2788 
   2789    assert(type.floating);
   2790 
   2791    /*
   2792     * This should be faster but all denormals will end up as infinity.
   2793     */
   2794    if (0 && lp_build_fast_rsqrt_available(type)) {
   2795       const unsigned num_iterations = 1;
   2796       LLVMValueRef res;
   2797       unsigned i;
   2798 
   2799       /* rsqrt(1.0) != 1.0 here */
   2800       res = lp_build_fast_rsqrt(bld, a);
   2801 
   2802       if (num_iterations) {
   2803          /*
   2804           * Newton-Raphson will result in NaN instead of infinity for zero,
   2805           * and NaN instead of zero for infinity.
   2806           * Also, need to ensure rsqrt(1.0) == 1.0.
   2807           * All numbers smaller than FLT_MIN will result in +infinity
   2808           * (rsqrtps treats all denormals as zero).
   2809           */
   2810          LLVMValueRef cmp;
   2811          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
   2812          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
   2813 
   2814          for (i = 0; i < num_iterations; ++i) {
   2815             res = lp_build_rsqrt_refine(bld, a, res);
   2816          }
   2817          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
   2818          res = lp_build_select(bld, cmp, inf, res);
   2819          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
   2820          res = lp_build_select(bld, cmp, bld->zero, res);
   2821          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
   2822          res = lp_build_select(bld, cmp, bld->one, res);
   2823       }
   2824 
   2825       return res;
   2826    }
   2827 
   2828    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   2829 }
   2830 
   2831 /**
 * Report whether a fast (but inaccurate) rsqrt instruction is available.
 * (The caller may want to avoid calling rsqrt_fast if it's not available:
 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
 * unavailable the fallback would result in sqrt/div/mul, so it is obviously
 * much better to just call sqrt directly, skipping both div and mul.)
   2837  */
   2838 boolean
   2839 lp_build_fast_rsqrt_available(struct lp_type type)
   2840 {
   2841    assert(type.floating);
   2842 
   2843    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
   2844        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
   2845       return true;
   2846    }
   2847    return false;
   2848 }
   2849 
   2850 
   2851 /**
   2852  * Generate 1/sqrt(a).
   2853  * Result is undefined for values < 0, infinity for +0.
   2854  * Precision is limited, only ~10 bits guaranteed
   2855  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
   2856  */
   2857 LLVMValueRef
   2858 lp_build_fast_rsqrt(struct lp_build_context *bld,
   2859                     LLVMValueRef a)
   2860 {
   2861    LLVMBuilderRef builder = bld->gallivm->builder;
   2862    const struct lp_type type = bld->type;
   2863 
   2864    assert(lp_check_value(type, a));
   2865 
   2866    if (lp_build_fast_rsqrt_available(type)) {
   2867       const char *intrinsic = NULL;
   2868 
   2869       if (type.length == 4) {
   2870          intrinsic = "llvm.x86.sse.rsqrt.ps";
   2871       }
   2872       else {
   2873          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
   2874       }
   2875       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   2876    }
   2877    else {
   2878       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   2879    }
   2880    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   2881 }
   2882 
   2883 
   2884 /**
   2885  * Generate sin(a) or cos(a) using polynomial approximation.
   2886  * TODO: it might be worth recognizing sin and cos using same source
   2887  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
   2888  * would be way cheaper than calculating (nearly) everything twice...
    2889  * Not sure it's common enough to be worth bothering with, however;
    2890  * the scs opcode could also benefit from calculating both.
   2891  */
   2892 static LLVMValueRef
   2893 lp_build_sin_or_cos(struct lp_build_context *bld,
   2894                     LLVMValueRef a,
   2895                     boolean cos)
   2896 {
   2897    struct gallivm_state *gallivm = bld->gallivm;
   2898    LLVMBuilderRef b = gallivm->builder;
   2899    struct lp_type int_type = lp_int_type(bld->type);
   2900 
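            /*
             * The code below follows the well-known cephes-derived approach
             * (as popularized by sse_mathfun, hence the _mm_* reference
             * snippets kept in the comments): reduce the argument, evaluate
             * separate sin and cos polynomials over the reduced range, select
             * by quadrant and patch up the sign bit at the end.
             */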
   2901    /*
   2902     *  take the absolute value,
   2903     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   2904     */
   2905 
   2906    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   2907    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
   2908 
   2909    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   2910    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
   2911 
   2912    /*
   2913     * scale by 4/Pi
   2914     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
   2915     */
   2916 
   2917    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   2918    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
   2919 
   2920    /*
   2921     * store the integer part of y in mm0
   2922     * emm2 = _mm_cvttps_epi32(y);
   2923     */
   2924 
   2925    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
   2926 
   2927    /*
   2928     * j=(j+1) & (~1) (see the cephes sources)
   2929     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   2930     */
   2931 
   2932    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   2933    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   2934    /*
   2935     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   2936     */
   2937    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   2938    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
   2939 
   2940    /*
   2941     * y = _mm_cvtepi32_ps(emm2);
   2942     */
   2943    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
   2944 
   2945    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   2946    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   2947    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   2948    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   2949 
   2950    /*
   2951     * Argument used for poly selection and sign bit determination
   2952     * is different for sin vs. cos.
   2953     */
   2954    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
   2955                                emm2_and;
   2956 
   2957    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
   2958                                                               LLVMBuildNot(b, emm2_2, ""), ""),
   2959                                               const_29, "sign_bit") :
   2960                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
   2961                                                               LLVMBuildShl(b, emm2_add,
   2962                                                                            const_29, ""), ""),
   2963                                               sign_mask, "sign_bit");
   2964 
   2965    /*
    2966     * get the polynomial selection mask:
    2967     * there is one polynomial for 0 <= x <= Pi/4
    2968     * and another one for Pi/4 < x <= Pi/2.
   2969     * Both branches will be computed.
   2970     *
   2971     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   2972     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   2973     */
   2974 
   2975    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   2976    LLVMValueRef poly_mask = lp_build_compare(gallivm,
   2977                                              int_type, PIPE_FUNC_EQUAL,
   2978                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   2979 
   2980    /*
   2981     * _PS_CONST(minus_cephes_DP1, -0.78515625);
   2982     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
   2983     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
   2984     */
   2985    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   2986    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   2987    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
   2988 
   2989    /*
   2990     * The magic pass: "Extended precision modular arithmetic"
   2991     * x = ((x - y * DP1) - y * DP2) - y * DP3;
   2992     */
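            /*
             * DP1 + DP2 + DP3 represent -Pi/4 split into pieces of decreasing
             * magnitude, so this is a Cody-Waite style reduction computing
             * x_abs - y_2 * Pi/4 with extended precision.
             */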
   2993    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   2994    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   2995    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
   2996 
   2997    /*
    2998     * Evaluate the first polynomial  (0 <= x <= Pi/4)
   2999     *
   3000     * z = _mm_mul_ps(x,x);
   3001     */
   3002    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
   3003 
   3004    /*
   3005     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
   3006     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
   3007     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
   3008     */
   3009    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   3010    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   3011    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
   3012 
   3013    /*
   3014     * y = *(v4sf*)_ps_coscof_p0;
   3015     * y = _mm_mul_ps(y, z);
   3016     */
   3017    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   3018    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   3019    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   3020    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
   3021 
   3022 
   3023    /*
   3024     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   3025     * y = _mm_sub_ps(y, tmp);
   3026     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
   3027     */
   3028    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   3029    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
    3030    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   3031    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
    3032    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
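            /* y_10 now approximates cos(x_3): 1 - z/2 + z^2*(p2 + p1*z + p0*z^2) */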
   3033 
   3034    /*
   3035     * _PS_CONST(sincof_p0, -1.9515295891E-4);
   3036     * _PS_CONST(sincof_p1,  8.3321608736E-3);
   3037     * _PS_CONST(sincof_p2, -1.6666654611E-1);
   3038     */
   3039    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   3040    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   3041    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
   3042 
   3043    /*
    3044     * Evaluate the second polynomial  (Pi/4 < x <= Pi/2)
   3045     *
   3046     * y2 = *(v4sf*)_ps_sincof_p0;
   3047     * y2 = _mm_mul_ps(y2, z);
   3048     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   3049     * y2 = _mm_mul_ps(y2, z);
   3050     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   3051     * y2 = _mm_mul_ps(y2, z);
   3052     * y2 = _mm_mul_ps(y2, x);
   3053     * y2 = _mm_add_ps(y2, x);
   3054     */
   3055 
   3056    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   3057    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   3058    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   3059    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
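            /* y2_9 now approximates sin(x_3): x + x^3*(p2 + p1*z + p0*z^2) */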
   3060 
   3061    /*
    3062     * select the correct result from the two polynomials
   3063     * xmm3 = poly_mask;
   3064     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   3065     * y = _mm_andnot_ps(xmm3, y);
   3066     * y = _mm_or_ps(y,y2);
   3067     */
   3068    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   3069    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   3070    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   3071    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   3072    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   3073    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
   3074 
   3075    /*
   3076     * update the sign
   3077     * y = _mm_xor_ps(y, sign_bit);
   3078     */
   3079    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   3080    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   3081 
   3082    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
   3083 
   3084    /* clamp output to be within [-1, 1] */
   3085    y_result = lp_build_clamp(bld, y_result,
   3086                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
   3087                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   3088    /* If a is -inf, inf or NaN then return NaN */
   3089    y_result = lp_build_select(bld, isfinite, y_result,
   3090                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   3091    return y_result;
   3092 }
   3093 
   3094 
   3095 /**
   3096  * Generate sin(a)
   3097  */
   3098 LLVMValueRef
   3099 lp_build_sin(struct lp_build_context *bld,
   3100              LLVMValueRef a)
   3101 {
   3102    return lp_build_sin_or_cos(bld, a, FALSE);
   3103 }
   3104 
   3105 
   3106 /**
   3107  * Generate cos(a)
   3108  */
   3109 LLVMValueRef
   3110 lp_build_cos(struct lp_build_context *bld,
   3111              LLVMValueRef a)
   3112 {
   3113    return lp_build_sin_or_cos(bld, a, TRUE);
   3114 }
   3115 
   3116 
   3117 /**
   3118  * Generate pow(x, y)
   3119  */
   3120 LLVMValueRef
   3121 lp_build_pow(struct lp_build_context *bld,
   3122              LLVMValueRef x,
   3123              LLVMValueRef y)
   3124 {
   3125    /* TODO: optimize the constant case */
    3126    if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
    3127        LLVMIsConstant(x) && LLVMIsConstant(y)) {
   3128       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3129                    __FUNCTION__);
   3130    }
   3131 
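            /*
             * pow(x, y) = exp2(y * log2(x)); note this identity (and hence
             * this helper) only makes sense for x > 0.
             */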
   3132    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
   3133 }
   3134 
   3135 
   3136 /**
   3137  * Generate exp(x)
   3138  */
   3139 LLVMValueRef
   3140 lp_build_exp(struct lp_build_context *bld,
   3141              LLVMValueRef x)
   3142 {
   3143    /* log2(e) = 1/log(2) */
   3144    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
   3145                                            1.4426950408889634);
   3146 
   3147    assert(lp_check_value(bld->type, x));
   3148 
   3149    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
   3150 }
   3151 
   3152 
   3153 /**
   3154  * Generate log(x)
   3155  * Behavior is undefined with infs, 0s and nans
   3156  */
   3157 LLVMValueRef
   3158 lp_build_log(struct lp_build_context *bld,
   3159              LLVMValueRef x)
   3160 {
   3161    /* log(2) */
   3162    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
   3163                                           0.69314718055994529);
   3164 
   3165    assert(lp_check_value(bld->type, x));
   3166 
   3167    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
   3168 }
   3169 
   3170 /**
   3171  * Generate log(x) that handles edge cases (infs, 0s and nans)
   3172  */
   3173 LLVMValueRef
   3174 lp_build_log_safe(struct lp_build_context *bld,
   3175                   LLVMValueRef x)
   3176 {
   3177    /* log(2) */
   3178    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
   3179                                           0.69314718055994529);
   3180 
   3181    assert(lp_check_value(bld->type, x));
   3182 
   3183    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
   3184 }
   3185 
   3186 
   3187 /**
   3188  * Generate polynomial.
   3189  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
   3190  */
   3191 LLVMValueRef
   3192 lp_build_polynomial(struct lp_build_context *bld,
   3193                     LLVMValueRef x,
   3194                     const double *coeffs,
   3195                     unsigned num_coeffs)
   3196 {
   3197    const struct lp_type type = bld->type;
   3198    LLVMValueRef even = NULL, odd = NULL;
   3199    LLVMValueRef x2;
   3200    unsigned i;
   3201 
   3202    assert(lp_check_value(bld->type, x));
   3203 
   3204    /* TODO: optimize the constant case */
    3205    if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
    3206        LLVMIsConstant(x)) {
   3207       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3208                    __FUNCTION__);
   3209    }
   3210 
   3211    /*
    3212     * Calculate odd and even terms separately to decrease data dependency
   3213     * Ex:
   3214     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
   3215     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
   3216     */
   3217    x2 = lp_build_mul(bld, x, x);
   3218 
   3219    for (i = num_coeffs; i--; ) {
   3220       LLVMValueRef coeff;
   3221 
   3222       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
   3223 
   3224       if (i % 2 == 0) {
   3225          if (even)
   3226             even = lp_build_mad(bld, x2, even, coeff);
   3227          else
   3228             even = coeff;
   3229       } else {
   3230          if (odd)
   3231             odd = lp_build_mad(bld, x2, odd, coeff);
   3232          else
   3233             odd = coeff;
   3234       }
   3235    }
   3236 
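            /* combine the two halves: result = even + x * odd */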
   3237    if (odd)
   3238       return lp_build_mad(bld, odd, x, even);
   3239    else if (even)
   3240       return even;
   3241    else
   3242       return bld->undef;
   3243 }
   3244 
   3245 
   3246 /**
   3247  * Minimax polynomial fit of 2**x, in range [0, 1[
   3248  */
   3249 const double lp_build_exp2_polynomial[] = {
   3250 #if EXP_POLY_DEGREE == 5
   3251    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   3252    0.693153073200168932794,
   3253    0.240153617044375388211,
   3254    0.0558263180532956664775,
   3255    0.00898934009049466391101,
   3256    0.00187757667519147912699
   3257 #elif EXP_POLY_DEGREE == 4
   3258    1.00000259337069434683,
   3259    0.693003834469974940458,
   3260    0.24144275689150793076,
   3261    0.0520114606103070150235,
   3262    0.0135341679161270268764
   3263 #elif EXP_POLY_DEGREE == 3
   3264    0.999925218562710312959,
   3265    0.695833540494823811697,
   3266    0.226067155427249155588,
   3267    0.0780245226406372992967
   3268 #elif EXP_POLY_DEGREE == 2
   3269    1.00172476321474503578,
   3270    0.657636275736077639316,
   3271    0.33718943461968720704
   3272 #else
   3273 #error
   3274 #endif
   3275 };
   3276 
   3277 
   3278 LLVMValueRef
   3279 lp_build_exp2(struct lp_build_context *bld,
   3280               LLVMValueRef x)
   3281 {
   3282    LLVMBuilderRef builder = bld->gallivm->builder;
   3283    const struct lp_type type = bld->type;
   3284    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   3285    LLVMValueRef ipart = NULL;
   3286    LLVMValueRef fpart = NULL;
   3287    LLVMValueRef expipart = NULL;
   3288    LLVMValueRef expfpart = NULL;
   3289    LLVMValueRef res = NULL;
   3290 
   3291    assert(lp_check_value(bld->type, x));
   3292 
   3293    /* TODO: optimize the constant case */
    3294    if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
    3295        LLVMIsConstant(x)) {
   3296       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3297                    __FUNCTION__);
   3298    }
   3299 
   3300    assert(type.floating && type.width == 32);
   3301 
    3302    /* We want to preserve NaN, and make sure that for exp2 if x > 128
    3303     * the result is INF and if it's smaller than -126.9 the result is 0. */
   3304    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
   3305                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   3306    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
   3307                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   3308 
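            /*
             * 2^x = 2^ipart * 2^fpart: 2^ipart is exact (just exponent bits),
             * while 2^fpart is approximated with the minimax polynomial above.
             */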
   3309    /* ipart = floor(x) */
   3310    /* fpart = x - ipart */
   3311    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   3312 
   3313    /* expipart = (float) (1 << ipart) */
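            /* i.e. place (ipart + 127) into the exponent field of a float */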
   3314    expipart = LLVMBuildAdd(builder, ipart,
   3315                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
   3316    expipart = LLVMBuildShl(builder, expipart,
   3317                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
   3318    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   3319 
   3320    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
   3321                                   ARRAY_SIZE(lp_build_exp2_polynomial));
   3322 
   3323    res = LLVMBuildFMul(builder, expipart, expfpart, "");
   3324 
   3325    return res;
   3326 }
   3327 
   3328 
   3329 
   3330 /**
    3331  * Extract the exponent of an IEEE-754 floating point value.
   3332  *
   3333  * Optionally apply an integer bias.
   3334  *
   3335  * Result is an integer value with
   3336  *
   3337  *   ifloor(log2(x)) + bias
   3338  */
   3339 LLVMValueRef
   3340 lp_build_extract_exponent(struct lp_build_context *bld,
   3341                           LLVMValueRef x,
   3342                           int bias)
   3343 {
   3344    LLVMBuilderRef builder = bld->gallivm->builder;
   3345    const struct lp_type type = bld->type;
   3346    unsigned mantissa = lp_mantissa(type);
   3347    LLVMValueRef res;
   3348 
   3349    assert(type.floating);
   3350 
   3351    assert(lp_check_value(bld->type, x));
   3352 
   3353    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   3354 
   3355    res = LLVMBuildLShr(builder, x,
   3356                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   3357    res = LLVMBuildAnd(builder, res,
   3358                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
   3359    res = LLVMBuildSub(builder, res,
   3360                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
   3361 
   3362    return res;
   3363 }
   3364 
   3365 
   3366 /**
    3367  * Extract the mantissa of a floating point value.
    3368  *
    3369  * Result is a floating point value with
    3370  *
    3371  *   x / 2**floor(log2(x))
   3372  */
   3373 LLVMValueRef
   3374 lp_build_extract_mantissa(struct lp_build_context *bld,
   3375                           LLVMValueRef x)
   3376 {
   3377    LLVMBuilderRef builder = bld->gallivm->builder;
   3378    const struct lp_type type = bld->type;
   3379    unsigned mantissa = lp_mantissa(type);
   3380    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
   3381                                                   (1ULL << mantissa) - 1);
   3382    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   3383    LLVMValueRef res;
   3384 
   3385    assert(lp_check_value(bld->type, x));
   3386 
   3387    assert(type.floating);
   3388 
   3389    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
   3390 
   3391    /* res = x / 2**ipart */
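            /* keep the mantissa bits and force the exponent to 0, so the result is in [1, 2) */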
   3392    res = LLVMBuildAnd(builder, x, mantmask, "");
   3393    res = LLVMBuildOr(builder, res, one, "");
   3394    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   3395 
   3396    return res;
   3397 }
   3398 
   3399 
   3400 
   3401 /**
    3402  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[.
    3403  * These coefficients can be generated with
   3404  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
   3405  */
   3406 const double lp_build_log2_polynomial[] = {
   3407 #if LOG_POLY_DEGREE == 5
   3408    2.88539008148777786488L,
   3409    0.961796878841293367824L,
   3410    0.577058946784739859012L,
   3411    0.412914355135828735411L,
   3412    0.308591899232910175289L,
   3413    0.352376952300281371868L,
   3414 #elif LOG_POLY_DEGREE == 4
   3415    2.88539009343309178325L,
   3416    0.961791550404184197881L,
   3417    0.577440339438736392009L,
   3418    0.403343858251329912514L,
   3419    0.406718052498846252698L,
   3420 #elif LOG_POLY_DEGREE == 3
   3421    2.88538959748872753838L,
   3422    0.961932915889597772928L,
   3423    0.571118517972136195241L,
   3424    0.493997535084709500285L,
   3425 #else
   3426 #error
   3427 #endif
   3428 };
   3429 
   3430 /**
   3431  * See http://www.devmaster.net/forums/showthread.php?p=43580
   3432  * http://en.wikipedia.org/wiki/Logarithm#Calculation
   3433  * http://www.nezumi.demon.co.uk/consult/logx.htm
   3434  *
   3435  * If handle_edge_cases is true the function will perform computations
   3436  * to match the required D3D10+ behavior for each of the edge cases.
   3437  * That means that if input is:
    3438  * - less than zero (down to and including -inf), then NaN will be returned
   3439  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
   3440  * - +infinity, then +infinity will be returned
   3441  * - NaN, then NaN will be returned
   3442  *
   3443  * Those checks are fairly expensive so if you don't need them make sure
   3444  * handle_edge_cases is false.
   3445  */
   3446 void
   3447 lp_build_log2_approx(struct lp_build_context *bld,
   3448                      LLVMValueRef x,
   3449                      LLVMValueRef *p_exp,
   3450                      LLVMValueRef *p_floor_log2,
   3451                      LLVMValueRef *p_log2,
   3452                      boolean handle_edge_cases)
   3453 {
   3454    LLVMBuilderRef builder = bld->gallivm->builder;
   3455    const struct lp_type type = bld->type;
   3456    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   3457    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   3458 
   3459    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   3460    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   3461    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
   3462 
   3463    LLVMValueRef i = NULL;
   3464    LLVMValueRef y = NULL;
   3465    LLVMValueRef z = NULL;
   3466    LLVMValueRef exp = NULL;
   3467    LLVMValueRef mant = NULL;
   3468    LLVMValueRef logexp = NULL;
   3469    LLVMValueRef p_z = NULL;
   3470    LLVMValueRef res = NULL;
   3471 
   3472    assert(lp_check_value(bld->type, x));
   3473 
    3474    if (p_exp || p_floor_log2 || p_log2) {
   3475       /* TODO: optimize the constant case */
    3476       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
    3477           LLVMIsConstant(x)) {
   3478          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
   3479                       __FUNCTION__);
   3480       }
   3481 
   3482       assert(type.floating && type.width == 32);
   3483 
   3484       /*
   3485        * We don't explicitly handle denormalized numbers. They will yield a
    3486        * result in the neighbourhood of -127, which appears to be adequate.
   3488        */
   3489 
   3490       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
   3491 
   3492       /* exp = (float) exponent(x) */
   3493       exp = LLVMBuildAnd(builder, i, expmask, "");
   3494    }
   3495 
    3496    if (p_floor_log2 || p_log2) {
   3497       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
   3498       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
   3499       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   3500    }
   3501 
   3502    if (p_log2) {
   3503       /* mant = 1 + (float) mantissa(x) */
   3504       mant = LLVMBuildAnd(builder, i, mantmask, "");
   3505       mant = LLVMBuildOr(builder, mant, one, "");
   3506       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
   3507 
   3508       /* y = (mant - 1) / (mant + 1) */
   3509       y = lp_build_div(bld,
   3510          lp_build_sub(bld, mant, bld->one),
   3511          lp_build_add(bld, mant, bld->one)
   3512       );
   3513 
   3514       /* z = y^2 */
   3515       z = lp_build_mul(bld, y, y);
   3516 
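                  /*
                   * log2(mant) = 2/ln(2) * atanh(y) ~= y * P(y^2); the leading
                   * coefficient of P (~2.885) absorbs the 2/ln(2) factor.
                   */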
   3517       /* compute P(z) */
   3518       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
   3519                                 ARRAY_SIZE(lp_build_log2_polynomial));
   3520 
   3521       /* y * P(z) + logexp */
   3522       res = lp_build_mad(bld, y, p_z, logexp);
   3523 
   3524       if (type.floating && handle_edge_cases) {
   3525          LLVMValueRef negmask, infmask,  zmask;
   3526          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
   3527                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
   3528          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
   3529                               lp_build_const_vec(bld->gallivm, type,  0.0f));
   3530          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
   3531                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
   3532 
    3533          /* If x is equal to inf make sure we return inf */
   3534          res = lp_build_select(bld, infmask,
   3535                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
   3536                                res);
    3537          /* If x is equal to 0, return -inf */
   3538          res = lp_build_select(bld, zmask,
   3539                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
   3540                                res);
   3541          /* If x is nan or less than 0, return nan */
   3542          res = lp_build_select(bld, negmask,
   3543                                lp_build_const_vec(bld->gallivm, type,  NAN),
   3544                                res);
   3545       }
   3546    }
   3547 
   3548    if (p_exp) {
   3549       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
   3550       *p_exp = exp;
   3551    }
   3552 
   3553    if (p_floor_log2)
   3554       *p_floor_log2 = logexp;
   3555 
   3556    if (p_log2)
   3557       *p_log2 = res;
   3558 }
   3559 
   3560 
   3561 /*
   3562  * log2 implementation which doesn't have special code to
   3563  * handle edge cases (-inf, 0, inf, NaN). It's faster but
   3564  * the results for those cases are undefined.
   3565  */
   3566 LLVMValueRef
   3567 lp_build_log2(struct lp_build_context *bld,
   3568               LLVMValueRef x)
   3569 {
   3570    LLVMValueRef res;
   3571    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   3572    return res;
   3573 }
   3574 
   3575 /*
   3576  * Version of log2 which handles all edge cases.
   3577  * Look at documentation of lp_build_log2_approx for
   3578  * description of the behavior for each of the edge cases.
   3579  */
   3580 LLVMValueRef
   3581 lp_build_log2_safe(struct lp_build_context *bld,
   3582                    LLVMValueRef x)
   3583 {
   3584    LLVMValueRef res;
   3585    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   3586    return res;
   3587 }
   3588 
   3589 
   3590 /**
   3591  * Faster (and less accurate) log2.
   3592  *
   3593  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
   3594  *
   3595  * Piece-wise linear approximation, with exact results when x is a
   3596  * power of two.
   3597  *
   3598  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
   3599  */
   3600 LLVMValueRef
   3601 lp_build_fast_log2(struct lp_build_context *bld,
   3602                    LLVMValueRef x)
   3603 {
   3604    LLVMBuilderRef builder = bld->gallivm->builder;
   3605    LLVMValueRef ipart;
   3606    LLVMValueRef fpart;
   3607 
   3608    assert(lp_check_value(bld->type, x));
   3609 
   3610    assert(bld->type.floating);
   3611 
   3612    /* ipart = floor(log2(x)) - 1 */
   3613    ipart = lp_build_extract_exponent(bld, x, -1);
   3614    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
   3615 
   3616    /* fpart = x / 2**ipart */
   3617    fpart = lp_build_extract_mantissa(bld, x);
   3618 
   3619    /* ipart + fpart */
   3620    return LLVMBuildFAdd(builder, ipart, fpart, "");
   3621 }
   3622 
   3623 
   3624 /**
   3625  * Fast implementation of iround(log2(x)).
   3626  *
   3627  * Not an approximation -- it should give accurate results all the time.
   3628  */
   3629 LLVMValueRef
   3630 lp_build_ilog2(struct lp_build_context *bld,
   3631                LLVMValueRef x)
   3632 {
   3633    LLVMBuilderRef builder = bld->gallivm->builder;
   3634    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   3635    LLVMValueRef ipart;
   3636 
   3637    assert(bld->type.floating);
   3638 
   3639    assert(lp_check_value(bld->type, x));
   3640 
   3641    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   3642    x = LLVMBuildFMul(builder, x, sqrt2, "");
   3643 
   3644    /* ipart = floor(log2(x) + 0.5)  */
   3645    ipart = lp_build_extract_exponent(bld, x, 0);
   3646 
   3647    return ipart;
   3648 }
   3649 
   3650 LLVMValueRef
   3651 lp_build_mod(struct lp_build_context *bld,
   3652              LLVMValueRef x,
   3653              LLVMValueRef y)
   3654 {
   3655    LLVMBuilderRef builder = bld->gallivm->builder;
   3656    LLVMValueRef res;
   3657    const struct lp_type type = bld->type;
   3658 
   3659    assert(lp_check_value(type, x));
   3660    assert(lp_check_value(type, y));
   3661 
   3662    if (type.floating)
   3663       res = LLVMBuildFRem(builder, x, y, "");
   3664    else if (type.sign)
   3665       res = LLVMBuildSRem(builder, x, y, "");
   3666    else
   3667       res = LLVMBuildURem(builder, x, y, "");
   3668    return res;
   3669 }
   3670 
   3671 
   3672 /*
    3673  * For floating point inputs, creates and returns a mask which is
    3674  * all 1's for channels of x which are NaN and all 0's for channels
    3675  * which are not.
   3676  */
   3677 LLVMValueRef
   3678 lp_build_isnan(struct lp_build_context *bld,
   3679                LLVMValueRef x)
   3680 {
   3681    LLVMValueRef mask;
   3682    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   3683 
   3684    assert(bld->type.floating);
   3685    assert(lp_check_value(bld->type, x));
   3686 
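            /* NaN is the only value that does not compare ordered-equal to itself */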
   3687    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
   3688                         "isnotnan");
   3689    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   3690    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   3691    return mask;
   3692 }
   3693 
    3694 /* Returns all 1's for channels holding finite floating point
    3695  * numbers and all 0's for channels holding -inf, +inf or NaN. */
   3697 LLVMValueRef
   3698 lp_build_isfinite(struct lp_build_context *bld,
   3699                   LLVMValueRef x)
   3700 {
   3701    LLVMBuilderRef builder = bld->gallivm->builder;
   3702    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   3703    struct lp_type int_type = lp_int_type(bld->type);
   3704    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   3705    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
   3706                                                     0x7f800000);
   3707 
   3708    if (!bld->type.floating) {
   3709       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   3710    }
   3711    assert(bld->type.floating);
   3712    assert(lp_check_value(bld->type, x));
   3713    assert(bld->type.width == 32);
   3714 
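            /* x is finite iff its exponent bits are not all ones */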
   3715    intx = LLVMBuildAnd(builder, intx, infornan32, "");
   3716    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
   3717                            intx, infornan32);
   3718 }
   3719 
   3720 /*
   3721  * Returns true if the number is nan or inf and false otherwise.
   3722  * The input has to be a floating point vector.
   3723  */
   3724 LLVMValueRef
   3725 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
   3726                        const struct lp_type type,
   3727                        LLVMValueRef x)
   3728 {
   3729    LLVMBuilderRef builder = gallivm->builder;
   3730    struct lp_type int_type = lp_int_type(type);
   3731    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
   3732                                                 0x7f800000);
   3733    LLVMValueRef ret;
   3734 
   3735    assert(type.floating);
   3736 
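            /* x is inf or NaN iff its exponent bits are all ones */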
   3737    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   3738    ret = LLVMBuildAnd(builder, ret, const0, "");
   3739    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
   3740                           ret, const0);
   3741 
   3742    return ret;
   3743 }
   3744 
   3745 
   3746 LLVMValueRef
   3747 lp_build_fpstate_get(struct gallivm_state *gallivm)
   3748 {
   3749    if (util_cpu_caps.has_sse) {
   3750       LLVMBuilderRef builder = gallivm->builder;
   3751       LLVMValueRef mxcsr_ptr = lp_build_alloca(
   3752          gallivm,
   3753          LLVMInt32TypeInContext(gallivm->context),
   3754          "mxcsr_ptr");
   3755       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
   3756           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
   3757       lp_build_intrinsic(builder,
   3758                          "llvm.x86.sse.stmxcsr",
   3759                          LLVMVoidTypeInContext(gallivm->context),
   3760                          &mxcsr_ptr8, 1, 0);
   3761       return mxcsr_ptr;
   3762    }
    3763    return NULL;
   3764 }
   3765 
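         /*
          * A typical usage sketch pairing these helpers: save the FP state,
          * force denormals to zero around generated code, then restore it:
          *
          *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
          *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
          *    ... emit code that must not see denormals ...
          *    lp_build_fpstate_set(gallivm, saved);
          */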
   3766 void
   3767 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
   3768                                   boolean zero)
   3769 {
   3770    if (util_cpu_caps.has_sse) {
   3771       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
   3772       int daz_ftz = _MM_FLUSH_ZERO_MASK;
   3773 
   3774       LLVMBuilderRef builder = gallivm->builder;
   3775       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
   3776       LLVMValueRef mxcsr =
   3777          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
   3778 
   3779       if (util_cpu_caps.has_daz) {
    3780          /* Enable denormals-are-zero mode */
   3781          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
   3782       }
   3783       if (zero) {
   3784          mxcsr = LLVMBuildOr(builder, mxcsr,
   3785                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
   3786       } else {
   3787          mxcsr = LLVMBuildAnd(builder, mxcsr,
   3788                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
   3789       }
   3790 
   3791       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
   3792       lp_build_fpstate_set(gallivm, mxcsr_ptr);
   3793    }
   3794 }
   3795 
   3796 void
   3797 lp_build_fpstate_set(struct gallivm_state *gallivm,
   3798                      LLVMValueRef mxcsr_ptr)
   3799 {
   3800    if (util_cpu_caps.has_sse) {
   3801       LLVMBuilderRef builder = gallivm->builder;
   3802       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
   3803                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
   3804       lp_build_intrinsic(builder,
   3805                          "llvm.x86.sse.ldmxcsr",
   3806                          LLVMVoidTypeInContext(gallivm->context),
   3807                          &mxcsr_ptr, 1, 0);
   3808    }
   3809 }
   3810