      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 
     29 /**
     30  * @file
     31  * Helper functions for type conversions.
     32  *
     33  * We want to use the fastest type for a given computation whenever feasible.
      34  * The other side of this is that we need to be able to convert between
      35  * several types accurately and efficiently.
     36  *
      37  * Conversion between types of different bit width is quite complex.
      38  *
      39  * Keep in mind a few invariants in type conversions (see the example below):
     40  *
      41  * - total register width (in bits) must remain constant:
     42  *
     43  *     src_type.width * src_type.length == dst_type.width * dst_type.length
     44  *
     45  * - total number of elements must remain constant:
     46  *
     47  *     src_type.length * num_srcs == dst_type.length * num_dsts
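          *
          * For example (purely illustrative numbers): converting four vectors of
          * 4 x float32 into a single vector of 16 x uint8 keeps both invariants,
          * since 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.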
     48  *
     49  * It is not always possible to do the conversion both accurately and
     50  * efficiently, usually due to lack of adequate machine instructions. In these
      51  * cases it is important not to take shortcuts here and sacrifice accuracy,
      52  * as these functions can be used anywhere. In the future we might have a
      53  * precision parameter which can gauge the accuracy vs efficiency compromise,
      54  * but for now, if the data conversion between two stages happens to be the
      55  * bottleneck, then most likely we should just avoid converting at all and run
     56  * both stages with the same type.
     57  *
     58  * Make sure to run lp_test_conv unit test after any change to this file.
     59  *
     60  * @author Jose Fonseca <jfonseca (at) vmware.com>
     61  */
     62 
     63 
     64 #include "util/u_debug.h"
     65 #include "util/u_math.h"
     66 #include "util/u_cpu_detect.h"
     67 
     68 #include "lp_bld_type.h"
     69 #include "lp_bld_const.h"
     70 #include "lp_bld_arit.h"
     71 #include "lp_bld_pack.h"
     72 #include "lp_bld_conv.h"
     73 #include "lp_bld_logic.h"
     74 
     75 
     76 /**
      77  * Converts a vector of int16 half-floats to float32.
      78  * Note this can be performed in a single instruction where the F16C
      79  * extension is available [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
      80  *
      81  * @param src_type      type of the int16 source vector
     82  * @param src           value to convert
     83  *
     84  * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
     85  */
     86 LLVMValueRef
      87 lp_build_half_to_float(struct gallivm_state *gallivm,
      88                        struct lp_type src_type,
      89                        LLVMValueRef src)
     90 {
     91    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
     92    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
     93 
     94    LLVMBuilderRef builder = gallivm->builder;
     95    LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
     96    LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
     97 
     98    /* Constants */
     99    LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
    100    LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
    101    LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
    102    LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
    103    LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
    104    LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
    105                                                    lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
    106                                                    float_vec_type, "");
    107 
    108    /* Convert int16 vector to int32 vector by zero ext */
    109    LLVMValueRef h             = LLVMBuildZExt(builder, src, int_vec_type, "");
    110 
    111    /* Exponent / mantissa bits */
    112    LLVMValueRef expmant       = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
    113    LLVMValueRef shifted       = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
    114 
    115    /* Exponent adjust */
    116    LLVMValueRef scaled        = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
    117 
    118    /* Make sure Inf/NaN survive */
    119    LLVMValueRef b_wasinfnan   = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
    120    LLVMValueRef infnanexp     = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
    121 
    122    /* Sign bit */
    123    LLVMValueRef justsign      = LLVMBuildXor(builder, h, expmant, "");
    124    LLVMValueRef sign          = LLVMBuildShl(builder, justsign, i32_16, "");
    125 
    126    /* Combine result */
    127    LLVMValueRef sign_inf      = LLVMBuildOr(builder, sign, infnanexp, "");
    128    LLVMValueRef final         = LLVMBuildOr(builder, scaled, sign_inf, "");
    129 
    130    /* Cast from int32 vector to float32 vector */
    131    return LLVMBuildBitCast(builder, final, float_vec_type, "");
    132 }
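
         /*
          * For reference only (this file does not use it): on CPUs with the F16C
          * extension the same conversion is a single instruction, e.g. via the
          * _mm_cvtph_ps() intrinsic mentioned above:
          *
          *    #include <immintrin.h>
          *    __m128i h4 = ...;              // four half-floats in the low 64 bits
          *    __m128  f4 = _mm_cvtph_ps(h4); // the corresponding four float32s
          */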
    133 
    134 
    135 /**
    136  * Special case for converting clamped IEEE-754 floats to unsigned norms.
    137  *
    138  * The mathematical voodoo below may seem excessive but it is actually
     139  * paramount we do it this way for several reasons. First, there is no
     140  * single-precision FP to unsigned integer conversion instruction in Intel SSE.
     141  * Second, even if there were, since the FP mantissa takes only a fraction of
     142  * the register bits, the typical scale-and-cast approach would require double
     143  * precision for accurate results, and therefore half the throughput.
    144  *
    145  * Although the result values can be scaled to an arbitrary bit width specified
     146  * by dst_width, the actual result type will have the same width as src_type.
    147  *
    148  * Ex: src = { float, float, float, float }
    149  * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
    150  */
    151 LLVMValueRef
    152 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
    153                                         struct lp_type src_type,
    154                                         unsigned dst_width,
    155                                         LLVMValueRef src)
    156 {
    157    LLVMBuilderRef builder = gallivm->builder;
    158    LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
    159    LLVMValueRef res;
    160    unsigned mantissa;
    161 
    162    assert(src_type.floating);
    163    assert(dst_width <= src_type.width);
    164    src_type.sign = FALSE;
    165 
    166    mantissa = lp_mantissa(src_type);
    167 
    168    if (dst_width <= mantissa) {
    169       /*
    170        * Apply magic coefficients that will make the desired result to appear
    171        * in the lowest significant bits of the mantissa, with correct rounding.
    172        *
    173        * This only works if the destination width fits in the mantissa.
    174        */
    175 
    176       unsigned long long ubound;
    177       unsigned long long mask;
    178       double scale;
    179       double bias;
    180 
    181       ubound = (1ULL << dst_width);
    182       mask = ubound - 1;
    183       scale = (double)mask/ubound;
    184       bias = (double)(1ULL << (mantissa - dst_width));
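
               /*
                * Purely as an illustration: with dst_width = 8 and a 23-bit float
                * mantissa, ubound = 256, mask = 255, scale = 255/256 and
                * bias = 2^(23 - 8) = 32768.0.  An input of 1.0 then becomes
                * 1.0 * 255/256 + 32768.0 = 32768.99609375, whose float32 encoding
                * holds 255 in its lowest 8 mantissa bits, while 0.0 becomes
                * 32768.0, holding 0.  Masking with 255 yields the final value.
                */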
    185 
    186       res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
    187       res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
    188       res = LLVMBuildBitCast(builder, res, int_vec_type, "");
    189       res = LLVMBuildAnd(builder, res,
    190                          lp_build_const_int_vec(gallivm, src_type, mask), "");
    191    }
    192    else if (dst_width == (mantissa + 1)) {
    193       /*
    194        * The destination width matches exactly what can be represented in
    195        * floating point (i.e., mantissa + 1 bits). So do a straight
    196        * multiplication followed by casting. No further rounding is necessary.
    197        */
    198 
    199       double scale;
    200 
    201       scale = (double)((1ULL << dst_width) - 1);
    202 
    203       res = LLVMBuildFMul(builder, src,
    204                           lp_build_const_vec(gallivm, src_type, scale), "");
    205       res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
    206    }
    207    else {
    208       /*
     209        * The destination exceeds what can be represented in floating point.
     210        * So multiply by the largest power of two we can get away with, and
     211        * then subtract the most significant bit to rescale to normalized values.
     212        *
     213        * The largest power of two factor we can get away with is
     214        * (1 << (src_type.width - 1)), because we need to use a signed conversion.
     215        * In theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules
     216        * state that INT_MIN should be returned by FPToSI on overflow, which is
     217        * the correct result for values near 1.0!
    218        *
    219        * This means we get (src_type.width - 1) correct bits for values near 0.0,
    220        * and (mantissa + 1) correct bits for values near 1.0. Equally or more
    221        * important, we also get exact results for 0.0 and 1.0.
    222        */
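
               /*
                * A purely illustrative run-through for src_type.width = 32 and
                * dst_width = 32: n = 31, scale = 2^31, lshift = 1, rshift = 31.
                * An input of 1.0 scales to 2^31, which FPToSI turns into INT_MIN
                * (bit pattern 0x80000000); the left shift then overflows to 0 and
                * the right shift gives 1, so the subtraction yields 0xffffffff,
                * i.e. the maximum unsigned norm.  An input of 0.0 stays 0.
                */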
    223 
    224       unsigned n = MIN2(src_type.width - 1, dst_width);
    225 
    226       double scale = (double)(1ULL << n);
    227       unsigned lshift = dst_width - n;
    228       unsigned rshift = n;
    229       LLVMValueRef lshifted;
    230       LLVMValueRef rshifted;
    231 
    232       res = LLVMBuildFMul(builder, src,
    233                           lp_build_const_vec(gallivm, src_type, scale), "");
    234       res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
    235 
    236       /*
    237        * Align the most significant bit to its final place.
    238        *
    239        * This will cause 1.0 to overflow to 0, but the later adjustment will
    240        * get it right.
    241        */
    242       if (lshift) {
    243          lshifted = LLVMBuildShl(builder, res,
    244                                  lp_build_const_int_vec(gallivm, src_type,
    245                                                         lshift), "");
    246       } else {
    247          lshifted = res;
    248       }
    249 
    250       /*
    251        * Align the most significant bit to the right.
    252        */
    253       rshifted =  LLVMBuildLShr(builder, res,
    254                                 lp_build_const_int_vec(gallivm, src_type, rshift),
    255                                 "");
    256 
    257       /*
     258        * Subtract the MSB (shifted down to the LSB), thereby rescaling from
    259        * (1 << dst_width) to ((1 << dst_width) - 1).
    260        */
    261 
    262       res = LLVMBuildSub(builder, lshifted, rshifted, "");
    263    }
    264 
    265    return res;
    266 }
    267 
    268 
    269 /**
    270  * Inverse of lp_build_clamped_float_to_unsigned_norm above.
    271  * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
    272  * return {float, float, float, float} with values in range [0, 1].
    273  */
    274 LLVMValueRef
    275 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
    276                                 unsigned src_width,
    277                                 struct lp_type dst_type,
    278                                 LLVMValueRef src)
    279 {
    280    LLVMBuilderRef builder = gallivm->builder;
    281    LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
    282    LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
    283    LLVMValueRef bias_;
    284    LLVMValueRef res;
    285    unsigned mantissa;
    286    unsigned n;
    287    unsigned long long ubound;
    288    unsigned long long mask;
    289    double scale;
    290    double bias;
    291 
    292    assert(dst_type.floating);
    293 
    294    mantissa = lp_mantissa(dst_type);
    295 
    296    if (src_width <= (mantissa + 1)) {
    297       /*
     298        * The source width fits what can be represented in floating
    299        * point (i.e., mantissa + 1 bits). So do a straight multiplication
    300        * followed by casting. No further rounding is necessary.
    301        */
    302 
    303       scale = 1.0/(double)((1ULL << src_width) - 1);
    304       res = LLVMBuildSIToFP(builder, src, vec_type, "");
    305       res = LLVMBuildFMul(builder, res,
    306                           lp_build_const_vec(gallivm, dst_type, scale), "");
    307       return res;
    308    }
    309    else {
    310       /*
    311        * The source width exceeds what can be represented in floating
    312        * point. So truncate the incoming values.
    313        */
    314 
    315       n = MIN2(mantissa, src_width);
    316 
    317       ubound = ((unsigned long long)1 << n);
    318       mask = ubound - 1;
    319       scale = (double)ubound/mask;
    320       bias = (double)((unsigned long long)1 << (mantissa - n));
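
               /*
                * Purely as an illustration: for src_width = 32 and a 23-bit float
                * mantissa, n = 23, so the incoming values are shifted right by 9
                * bits, bias = 1.0 (2^0) and scale = 2^23 / (2^23 - 1).  OR-ing the
                * 23 remaining bits into the mantissa of 1.0 gives a float in
                * [1, 2); subtracting the bias gives [0, 1 - 2^-23], and the final
                * multiply rescales that to [0, 1].
                */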
    321 
    322       res = src;
    323 
    324       if (src_width > mantissa) {
    325          int shift = src_width - mantissa;
    326          res = LLVMBuildLShr(builder, res,
    327                              lp_build_const_int_vec(gallivm, dst_type, shift), "");
    328       }
    329 
    330       bias_ = lp_build_const_vec(gallivm, dst_type, bias);
    331 
    332       res = LLVMBuildOr(builder,
    333                         res,
    334                         LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
    335 
    336       res = LLVMBuildBitCast(builder, res, vec_type, "");
    337 
    338       res = LLVMBuildFSub(builder, res, bias_, "");
    339       res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
    340    }
    341 
    342    return res;
    343 }
    344 
    345 
    346 /**
    347  * Generic type conversion.
    348  *
    349  * TODO: Take a precision argument, or even better, add a new precision member
     350  * to the lp_type structure.
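          *
          * Purely illustrative usage (hypothetical caller; the types shown are just
          * an example):
          *
          *    // four vectors of 4 x float32  -->  one vector of 16 x unorm8
          *    struct lp_type f32x4 = lp_type_float_vec(32, 128);
          *    struct lp_type un8x16;
          *    memset(&un8x16, 0, sizeof un8x16);
          *    un8x16.norm = 1;
          *    un8x16.width = 8;
          *    un8x16.length = 16;
          *    lp_build_conv(gallivm, f32x4, un8x16, src, 4, dst, 1);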
    351  */
    352 void
    353 lp_build_conv(struct gallivm_state *gallivm,
    354               struct lp_type src_type,
    355               struct lp_type dst_type,
    356               const LLVMValueRef *src, unsigned num_srcs,
    357               LLVMValueRef *dst, unsigned num_dsts)
    358 {
    359    LLVMBuilderRef builder = gallivm->builder;
    360    struct lp_type tmp_type;
    361    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    362    unsigned num_tmps;
    363    unsigned i;
    364 
     365    /* We must not lose or gain channels, only precision. */
    366    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
    367 
    368    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
    369    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
    370    assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
    371    assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
    372 
    373    tmp_type = src_type;
    374    for(i = 0; i < num_srcs; ++i) {
    375       assert(lp_check_value(src_type, src[i]));
    376       tmp[i] = src[i];
    377    }
    378    num_tmps = num_srcs;
    379 
    380 
    381    /* Special case 4x4f --> 1x16ub
    382     */
    383    if (src_type.floating == 1 &&
    384        src_type.fixed    == 0 &&
    385        src_type.sign     == 1 &&
    386        src_type.norm     == 0 &&
    387        src_type.width    == 32 &&
    388        src_type.length   == 4 &&
    389 
    390        dst_type.floating == 0 &&
    391        dst_type.fixed    == 0 &&
    392        dst_type.sign     == 0 &&
    393        dst_type.norm     == 1 &&
    394        dst_type.width    == 8 &&
    395        dst_type.length   == 16 &&
    396 
    397        4 * num_dsts      == num_srcs &&
    398 
    399        util_cpu_caps.has_sse2)
    400    {
    401       struct lp_build_context bld;
    402       struct lp_type int16_type = dst_type;
    403       struct lp_type int32_type = dst_type;
    404       LLVMValueRef const_255f;
    405       unsigned i, j;
    406 
    407       lp_build_context_init(&bld, gallivm, src_type);
    408 
    409       int16_type.width *= 2;
    410       int16_type.length /= 2;
    411       int16_type.sign = 1;
    412 
    413       int32_type.width *= 4;
    414       int32_type.length /= 4;
    415       int32_type.sign = 1;
    416 
    417       const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
    418 
    419       for (i = 0; i < num_dsts; ++i, src += 4) {
    420          LLVMValueRef lo, hi;
    421 
    422          for (j = 0; j < 4; ++j) {
    423             tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
    424             tmp[j] = lp_build_iround(&bld, tmp[j]);
    425          }
    426 
    427          /* relying on clamping behavior of sse2 intrinsics here */
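                  /* (the int32 -> int16 pack saturates signed, the int16 -> uint8
                   * pack saturates unsigned, so values end up clamped to [0, 255]) */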
    428          lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
    429          hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
    430          dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
    431       }
    432 
    433       return;
    434    }
    435 
    436    /* Special case 2x8f --> 1x16ub
    437     */
    438    else if (src_type.floating == 1 &&
    439       src_type.fixed    == 0 &&
    440       src_type.sign     == 1 &&
    441       src_type.norm     == 0 &&
    442       src_type.width    == 32 &&
    443       src_type.length   == 8 &&
    444 
    445       dst_type.floating == 0 &&
    446       dst_type.fixed    == 0 &&
    447       dst_type.sign     == 0 &&
    448       dst_type.norm     == 1 &&
    449       dst_type.width    == 8 &&
    450       dst_type.length   == 16 &&
    451 
    452       2 * num_dsts      == num_srcs &&
    453 
    454       util_cpu_caps.has_avx) {
    455 
    456       struct lp_build_context bld;
    457       struct lp_type int16_type = dst_type;
    458       struct lp_type int32_type = dst_type;
    459       LLVMValueRef const_255f;
    460       unsigned i;
    461 
    462       lp_build_context_init(&bld, gallivm, src_type);
    463 
    464       int16_type.width *= 2;
    465       int16_type.length /= 2;
    466       int16_type.sign = 1;
    467 
    468       int32_type.width *= 4;
    469       int32_type.length /= 4;
    470       int32_type.sign = 1;
    471 
    472       const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
    473 
    474       for (i = 0; i < num_dsts; ++i, src += 2) {
    475          LLVMValueRef lo, hi, a, b;
    476 
    477          a = LLVMBuildFMul(builder, src[0], const_255f, "");
    478          b = LLVMBuildFMul(builder, src[1], const_255f, "");
    479 
    480          a = lp_build_iround(&bld, a);
    481          b = lp_build_iround(&bld, b);
    482 
    483          tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
    484          tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
    485          tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
    486          tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
    487 
    488          /* relying on clamping behavior of sse2 intrinsics here */
    489          lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
    490          hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
    491          dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
    492       }
    493       return;
    494    }
    495 
     496    /* Pre-convert half-floats to floats
    497     */
    498    else if (src_type.floating && src_type.width == 16)
    499    {
    500       for(i = 0; i < num_tmps; ++i)
    501          tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
    502 
    503       tmp_type.width = 32;
    504    }
    505 
    506    /*
    507     * Clamp if necessary
    508     */
    509 
    510    if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
    511       struct lp_build_context bld;
    512       double src_min = lp_const_min(src_type);
    513       double dst_min = lp_const_min(dst_type);
    514       double src_max = lp_const_max(src_type);
    515       double dst_max = lp_const_max(dst_type);
    516       LLVMValueRef thres;
    517 
    518       lp_build_context_init(&bld, gallivm, tmp_type);
    519 
    520       if(src_min < dst_min) {
    521          if(dst_min == 0.0)
    522             thres = bld.zero;
    523          else
     524             thres = lp_build_const_vec(gallivm, tmp_type, dst_min);
    525          for(i = 0; i < num_tmps; ++i)
    526             tmp[i] = lp_build_max(&bld, tmp[i], thres);
    527       }
    528 
    529       if(src_max > dst_max) {
    530          if(dst_max == 1.0)
    531             thres = bld.one;
    532          else
     533             thres = lp_build_const_vec(gallivm, tmp_type, dst_max);
    534          for(i = 0; i < num_tmps; ++i)
    535             tmp[i] = lp_build_min(&bld, tmp[i], thres);
    536       }
    537    }
    538 
    539    /*
    540     * Scale to the narrowest range
    541     */
    542 
    543    if(dst_type.floating) {
    544       /* Nothing to do */
    545    }
    546    else if(tmp_type.floating) {
    547       if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
    548          for(i = 0; i < num_tmps; ++i) {
    549             tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
    550                                                              tmp_type,
    551                                                              dst_type.width,
    552                                                              tmp[i]);
    553          }
    554          tmp_type.floating = FALSE;
    555       }
    556       else {
    557          double dst_scale = lp_const_scale(dst_type);
    558          LLVMTypeRef tmp_vec_type;
    559 
    560          if (dst_scale != 1.0) {
    561             LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
    562             for(i = 0; i < num_tmps; ++i)
    563                tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
    564          }
    565 
    566          /* Use an equally sized integer for intermediate computations */
    567          tmp_type.floating = FALSE;
    568          tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
    569          for(i = 0; i < num_tmps; ++i) {
    570 #if 0
    571             if(dst_type.sign)
    572                tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
    573             else
    574                tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
    575 #else
    576            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
    577             tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
    578 #endif
    579          }
    580       }
    581    }
    582    else {
    583       unsigned src_shift = lp_const_shift(src_type);
    584       unsigned dst_shift = lp_const_shift(dst_type);
    585       unsigned src_offset = lp_const_offset(src_type);
    586       unsigned dst_offset = lp_const_offset(dst_type);
    587 
    588       /* Compensate for different offsets */
    589       if (dst_offset > src_offset && src_type.width > dst_type.width) {
    590          for (i = 0; i < num_tmps; ++i) {
    591             LLVMValueRef shifted;
    592             LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
    593             if(src_type.sign)
    594                shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
    595             else
    596                shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
    597 
    598             tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
    599          }
    600       }
    601 
    602       if(src_shift > dst_shift) {
    603          LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
    604                                                      src_shift - dst_shift);
    605          for(i = 0; i < num_tmps; ++i)
    606             if(src_type.sign)
    607                tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
    608             else
    609                tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
    610       }
    611    }
    612 
    613    /*
    614     * Truncate or expand bit width
    615     *
    616     * No data conversion should happen here, although the sign bits are
    617     * crucial to avoid bad clamping.
    618     */
    619 
    620    {
    621       struct lp_type new_type;
    622 
    623       new_type = tmp_type;
    624       new_type.sign   = dst_type.sign;
    625       new_type.width  = dst_type.width;
    626       new_type.length = dst_type.length;
    627 
    628       lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
    629 
    630       tmp_type = new_type;
    631       num_tmps = num_dsts;
    632    }
    633 
    634    /*
    635     * Scale to the widest range
    636     */
    637 
    638    if(src_type.floating) {
    639       /* Nothing to do */
    640    }
    641    else if(!src_type.floating && dst_type.floating) {
    642       if(!src_type.fixed && !src_type.sign && src_type.norm) {
    643          for(i = 0; i < num_tmps; ++i) {
    644             tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
    645                                                      src_type.width,
    646                                                      dst_type,
    647                                                      tmp[i]);
    648          }
    649          tmp_type.floating = TRUE;
    650       }
    651       else {
    652          double src_scale = lp_const_scale(src_type);
    653          LLVMTypeRef tmp_vec_type;
    654 
     655          /* Use an equally sized floating point type for intermediate computations */
    656          tmp_type.floating = TRUE;
    657          tmp_type.sign = TRUE;
    658          tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
    659          for(i = 0; i < num_tmps; ++i) {
    660 #if 0
    661             if(dst_type.sign)
    662                tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
    663             else
    664                tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
    665 #else
    666             /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
    667             tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
    668 #endif
    669           }
    670 
    671           if (src_scale != 1.0) {
    672              LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
    673              for(i = 0; i < num_tmps; ++i)
    674                 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
    675           }
    676       }
    677     }
    678     else {
    679        unsigned src_shift = lp_const_shift(src_type);
    680        unsigned dst_shift = lp_const_shift(dst_type);
    681        unsigned src_offset = lp_const_offset(src_type);
    682        unsigned dst_offset = lp_const_offset(dst_type);
    683 
    684        if (src_shift < dst_shift) {
    685           LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
    686           LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
    687 
    688           for (i = 0; i < num_tmps; ++i) {
    689              pre_shift[i] = tmp[i];
    690              tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
    691           }
    692 
    693           /* Compensate for different offsets */
    694           if (dst_offset > src_offset) {
    695              for (i = 0; i < num_tmps; ++i) {
    696                 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
    697              }
    698           }
    699        }
    700     }
    701 
    702    for(i = 0; i < num_dsts; ++i) {
    703       dst[i] = tmp[i];
    704       assert(lp_check_value(dst_type, dst[i]));
    705    }
    706 }
    707 
    708 
    709 /**
    710  * Bit mask conversion.
    711  *
    712  * This will convert the integer masks that match the given types.
    713  *
     714  * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
    715  * Any other value will likely cause unpredictable results.
    716  *
    717  * This is basically a very trimmed down version of lp_build_conv.
    718  */
    719 void
    720 lp_build_conv_mask(struct gallivm_state *gallivm,
    721                    struct lp_type src_type,
    722                    struct lp_type dst_type,
    723                    const LLVMValueRef *src, unsigned num_srcs,
    724                    LLVMValueRef *dst, unsigned num_dsts)
    725 {
    726 
     727    /* We must not lose or gain channels, only precision. */
    728    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
    729 
    730    /*
     731     * Drop the floating, fixed and norm flags and treat the values as plain
     732     * signed integers: since we assume all values are 0 or -1, widening by
     733     * sign extension (or truncating) preserves them.
    734     */
    735 
    736    src_type.floating = FALSE;
    737    src_type.fixed = FALSE;
    738    src_type.sign = TRUE;
    739    src_type.norm = FALSE;
    740 
    741    dst_type.floating = FALSE;
    742    dst_type.fixed = FALSE;
    743    dst_type.sign = TRUE;
    744    dst_type.norm = FALSE;
    745 
    746    /*
    747     * Truncate or expand bit width
    748     */
    749 
    750    lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
    751 }
    752