Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2010 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
     18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     21  *
     22  * The above copyright notice and this permission notice (including the
     23  * next paragraph) shall be included in all copies or substantial portions
     24  * of the Software.
     25  *
     26  **************************************************************************/
     27 
     28 
     29 #include "util/u_debug.h"
     30 #include "util/u_cpu_detect.h"
     31 #include "util/u_math.h"
     32 #include "lp_bld_debug.h"
     33 #include "lp_bld_const.h"
     34 #include "lp_bld_format.h"
     35 #include "lp_bld_gather.h"
     36 #include "lp_bld_swizzle.h"
     37 #include "lp_bld_type.h"
     38 #include "lp_bld_init.h"
     39 #include "lp_bld_intr.h"
     40 #include "lp_bld_pack.h"
     41 
     42 
     43 /**
     44  * Get the pointer to one element from scatter positions in memory.
     45  *
     46  * @sa lp_build_gather()
     47  */
     48 LLVMValueRef
     49 lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
     50                          unsigned length,
     51                          LLVMValueRef base_ptr,
     52                          LLVMValueRef offsets,
     53                          unsigned i)
     54 {
     55    LLVMValueRef offset;
     56    LLVMValueRef ptr;
     57 
     58    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
     59 
     60    if (length == 1) {
     61       assert(i == 0);
     62       offset = offsets;
     63    } else {
     64       LLVMValueRef index = lp_build_const_int32(gallivm, i);
     65       offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
     66    }
     67 
     68    ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
     69 
     70    return ptr;
     71 }
     72 
     73 
     74 /**
     75  * Gather one element from scatter positions in memory.
     76  *
     77  * @sa lp_build_gather()
     78  */
     79 LLVMValueRef
     80 lp_build_gather_elem(struct gallivm_state *gallivm,
     81                      unsigned length,
     82                      unsigned src_width,
     83                      unsigned dst_width,
     84                      boolean aligned,
     85                      LLVMValueRef base_ptr,
     86                      LLVMValueRef offsets,
     87                      unsigned i,
     88                      boolean vector_justify)
     89 {
     90    LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
     91    LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
     92    LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
     93    LLVMValueRef ptr;
     94    LLVMValueRef res;
     95 
     96    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
     97 
     98    ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
     99    ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
    100    res = LLVMBuildLoad(gallivm->builder, ptr, "");
    101 
    102    /* XXX
    103     * On some archs we probably really want to avoid having to deal
    104     * with alignments lower than 4 bytes (if fetch size is a power of
    105     * two >= 32). On x86 it doesn't matter, however.
    106     * We should be able to guarantee full alignment for any kind of texture
    107     * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    108     * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    109     * but I don't think that's quite what we wanted).
    110     * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    111     * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    112     * enforcing what we want (which is what d3d10 does, the offset needs to
    113     * be aligned to element size, but GL has bytes regardless of element
    114     * size which would only leave us with minimum alignment restriction of 16
    115     * which doesn't make much sense if the type isn't 4x32bit). Due to
    116     * translation of offsets to first_elem in sampler_views it actually seems
    117     * gallium could not do anything else except 16 no matter what...
    118     */
    119    if (!aligned) {
    120       LLVMSetAlignment(res, 1);
    121    } else if (!util_is_power_of_two(src_width)) {
    122       /*
    123        * Full alignment is impossible, assume the caller really meant
    124        * the individual elements were aligned (e.g. 3x32bit format).
    125        * And yes the generated code may otherwise crash, llvm will
    126        * really assume 128bit alignment with a 96bit fetch (I suppose
    127        * that makes sense as it can just assume the upper 32bit to be
    128        * whatever).
    129        * Maybe the caller should be able to explicitly set this, but
    130        * this should cover all the 3-channel formats.
    131        */
    132       if (((src_width / 24) * 24 == src_width) &&
    133            util_is_power_of_two(src_width / 24)) {
    134           LLVMSetAlignment(res, src_width / 24);
    135       } else {
    136          LLVMSetAlignment(res, 1);
    137       }
    138    }
    139 
    140    assert(src_width <= dst_width);
    141    if (src_width < dst_width) {
    142       res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
    143       if (vector_justify) {
    144 #ifdef PIPE_ARCH_BIG_ENDIAN
    145          res = LLVMBuildShl(gallivm->builder, res,
    146                             LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
    147 #endif
    148       }
    149    }
    150 
    151    return res;
    152 }
    153 
    154 
    155 /**
    156  * Gather one element from scatter positions in memory.
    157  * Nearly the same as above, however the individual elements
    158  * may be vectors themselves, and fetches may be float type.
    159  * Can also do pad vector instead of ZExt.
    160  *
    161  * @sa lp_build_gather()
    162  */
    163 static LLVMValueRef
    164 lp_build_gather_elem_vec(struct gallivm_state *gallivm,
    165                          unsigned length,
    166                          unsigned src_width,
    167                          LLVMTypeRef src_type,
    168                          struct lp_type dst_type,
    169                          boolean aligned,
    170                          LLVMValueRef base_ptr,
    171                          LLVMValueRef offsets,
    172                          unsigned i,
    173                          boolean vector_justify)
    174 {
    175    LLVMValueRef ptr, res;
    176    LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
    177    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
    178 
    179    ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
    180    ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
    181    res = LLVMBuildLoad(gallivm->builder, ptr, "");
    182 
    183    /* XXX
    184     * On some archs we probably really want to avoid having to deal
    185     * with alignments lower than 4 bytes (if fetch size is a power of
    186     * two >= 32). On x86 it doesn't matter, however.
    187     * We should be able to guarantee full alignment for any kind of texture
    188     * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    189     * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    190     * but I don't think that's quite what we wanted).
    191     * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    192     * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    193     * enforcing what we want (which is what d3d10 does, the offset needs to
    194     * be aligned to element size, but GL has bytes regardless of element
    195     * size which would only leave us with minimum alignment restriction of 16
    196     * which doesn't make much sense if the type isn't 4x32bit). Due to
    197     * translation of offsets to first_elem in sampler_views it actually seems
    198     * gallium could not do anything else except 16 no matter what...
    199     */
    200    if (!aligned) {
    201       LLVMSetAlignment(res, 1);
    202    } else if (!util_is_power_of_two(src_width)) {
    203       /*
    204        * Full alignment is impossible, assume the caller really meant
    205        * the individual elements were aligned (e.g. 3x32bit format).
    206        * And yes the generated code may otherwise crash, llvm will
    207        * really assume 128bit alignment with a 96bit fetch (I suppose
    208        * that makes sense as it can just assume the upper 32bit to be
    209        * whatever).
    210        * Maybe the caller should be able to explicitly set this, but
    211        * this should cover all the 3-channel formats.
    212        */
    213       if (((src_width / 24) * 24 == src_width) &&
    214            util_is_power_of_two(src_width / 24)) {
    215           LLVMSetAlignment(res, src_width / 24);
    216       } else {
    217          LLVMSetAlignment(res, 1);
    218       }
    219    }
    220 
    221    assert(src_width <= dst_type.width * dst_type.length);
    222    if (src_width < dst_type.width * dst_type.length) {
    223       if (dst_type.length > 1) {
    224          res = lp_build_pad_vector(gallivm, res, dst_type.length);
    225          /*
    226           * vector_justify hopefully a non-issue since we only deal
    227           * with src_width >= 32 here?
    228           */
    229       } else {
    230          LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
    231 
    232          /*
    233           * Only valid if src_ptr_type is int type...
    234           */
    235          res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
    236 
    237          if (vector_justify) {
    238 #ifdef PIPE_ARCH_BIG_ENDIAN
    239          res = LLVMBuildShl(gallivm->builder, res,
    240                             LLVMConstInt(dst_elem_type,
    241                                          dst_type.width - src_width, 0), "");
    242 #endif
    243          }
    244       }
    245    }
    246    return res;
    247 }
    248 
    249 
    250 
    251 
/**
 * Gather length elements of src_width bits each using the x86 AVX2 gather
 * instructions, returning the result bitcast to a vector of dst_type with
 * length * dst_type.length total channels.
 *
 * base_ptr must be an i8 pointer; offsets are byte offsets (scale is 1).
 * Callers must only use this for src_width 32 or 64 and the vector lengths
 * asserted below (see lp_build_gather()).
 */
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     struct lp_type dst_type,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type, src_vec_type;
   LLVMValueRef res;
   /* the result covers all gathered elements concatenated */
   struct lp_type res_type = dst_type;
   res_type.length *= length;

   /* fetch as float/double for float dst so the gather intrinsic type matches */
   if (dst_type.floating) {
      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
                                   LLVMFloatTypeInContext(gallivm->context);
   } else {
      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   }
   src_vec_type = LLVMVectorType(src_type, length);

   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   /* Dead code path kept for reference: generic llvm.masked.gather. */
   if (0) {
      /*
       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
       * least with Haswell. See
       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
       * And the generated code doing the emulation is quite a bit worse
       * than what we get by doing it ourselves too.
       */
      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
      LLVMValueRef src_ptr;

      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");

      /* Rescale offsets from bytes to elements */
      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
      assert(LLVMTypeOf(offsets) == i32_vec_type);
      offsets = LLVMBuildSDiv(builder, offsets, scale, "");

      /* vector GEP: produces a vector of pointers, one per element */
      src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");

      char intrinsic[64];
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
                    length, dst_type.floating ? "f" : "i", src_width);
      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
      /* all-ones mask: gather every lane */
      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);

      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
   } else {
      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
      const char *intrinsic = NULL;
      unsigned l_idx = 0;

      /* only the combinations covered by the intrinsics table below */
      assert(src_width == 32 || src_width == 64);
      if (src_width == 32) {
         assert(length == 4 || length == 8);
      } else {
         assert(length == 2 || length == 4);
      }

      /* indexed by [floating][64bit][256bit-wide] */
      static const char *intrinsics[2][2][2] = {

         {{"llvm.x86.avx2.gather.d.d",
           "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q",
           "llvm.x86.avx2.gather.d.q.256"}},

         {{"llvm.x86.avx2.gather.d.ps",
           "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd",
           "llvm.x86.avx2.gather.d.pd.256"}},
      };

      /* these combinations need the 256-bit wide variants */
      if ((src_width == 32 && length == 8) ||
          (src_width == 64 && length == 4)) {
         l_idx = 1;
      }
      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];

      /*
       * All-ones mask selects every lane (the hw gathers lanes whose mask
       * sign bit is set).
       * NOTE(review): the bitcast below casts the mask to its own type and
       * is thus a no-op; presumably an integer-typed mask bitcast to
       * src_vec_type was intended for the ps/pd variants — confirm against
       * the LLVM version in use (LLVMConstAllOnes on a float vector type).
       */
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
      mask = LLVMConstBitCast(mask, src_vec_type);
      /* offsets are in bytes already, so hw scale factor is 1 */
      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);

      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
   }
   /* cast from the fetch type to the requested destination type */
   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");

   return res;
}
    357 
    358 
    359 /**
    360  * Gather elements from scatter positions in memory into a single vector.
    361  * Use for fetching texels from a texture.
    362  * For SSE, typical values are length=4, src_width=32, dst_width=32.
    363  *
    364  * When src_width < dst_width, the return value can be justified in
    365  * one of two ways:
    366  * "integer justification" is used when the caller treats the destination
    367  * as a packed integer bitmask, as described by the channels' "shift" and
    368  * "width" fields;
    369  * "vector justification" is used when the caller casts the destination
    370  * to a vector and needs channel X to be in vector element 0.
    371  *
    372  * @param length length of the offsets
    373  * @param src_width src element width in bits
    374  * @param dst_type result element type (src will be expanded to fit,
    375  *        but truncation is not allowed)
    376  *        (this may be a vector, must be pot sized)
    377  * @param aligned whether the data is guaranteed to be aligned (to src_width)
    378  * @param base_ptr base pointer, needs to be a i8 pointer type.
    379  * @param offsets vector with offsets
    380  * @param vector_justify select vector rather than integer justification
    381  */
    382 LLVMValueRef
    383 lp_build_gather(struct gallivm_state *gallivm,
    384                 unsigned length,
    385                 unsigned src_width,
    386                 struct lp_type dst_type,
    387                 boolean aligned,
    388                 LLVMValueRef base_ptr,
    389                 LLVMValueRef offsets,
    390                 boolean vector_justify)
    391 {
    392    LLVMValueRef res;
    393    boolean need_expansion = src_width < dst_type.width * dst_type.length;
    394    boolean vec_fetch;
    395    struct lp_type fetch_type, fetch_dst_type;
    396    LLVMTypeRef src_type;
    397 
    398    assert(src_width <= dst_type.width * dst_type.length);
    399 
    400    /*
    401     * This is quite a mess...
    402     * Figure out if the fetch should be done as:
    403     * a) scalar or vector
    404     * b) float or int
    405     *
    406     * As an example, for a 96bit fetch expanded into 4x32bit, it is better
    407     * to use (3x32bit) vector type (then pad the vector). Otherwise, the
    408     * zext will cause extra instructions.
    409     * However, the same isn't true for 3x16bit (the codegen for that is
    410     * completely worthless on x86 simd, and for 3x8bit is is way worse
    411     * still, don't try that... (To get really good code out of llvm for
    412     * these cases, the only way is to decompose the fetches manually
    413     * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
    414     * case requires sse41, otherwise simple scalar zext is way better.
    415     * But probably not important enough, so don't bother.)
    416     * Also, we try to honor the floating bit of destination (but isn't
    417     * possible if caller asks for instance for 2x32bit dst_type with
    418     * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
    419     * cast to 2x32f type, so the fetch is always int and on top of that
    420     * we avoid the vec pad and use scalar zext due the above mentioned
    421     * issue).
    422     * Note this is optimized for x86 sse2 and up backend. Could be tweaked
    423     * for other archs if necessary...
    424     */
    425    if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
    426        (dst_type.length > 1)) {
    427       /* use vector fetch (if dst_type is vector) */
    428       vec_fetch = TRUE;
    429       if (dst_type.floating) {
    430          fetch_type = lp_type_float_vec(dst_type.width, src_width);
    431       } else {
    432          fetch_type = lp_type_int_vec(dst_type.width, src_width);
    433       }
    434       /* intentionally not using lp_build_vec_type here */
    435       src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
    436                                 fetch_type.length);
    437       fetch_dst_type = fetch_type;
    438       fetch_dst_type.length = dst_type.length;
    439     } else {
    440       /* use scalar fetch */
    441       vec_fetch = FALSE;
    442       if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
    443          fetch_type = lp_type_float(src_width);
    444       } else {
    445          fetch_type = lp_type_int(src_width);
    446       }
    447       src_type = lp_build_vec_type(gallivm, fetch_type);
    448       fetch_dst_type = fetch_type;
    449       fetch_dst_type.width = dst_type.width * dst_type.length;
    450    }
    451 
    452    if (length == 1) {
    453       /* Scalar */
    454       res = lp_build_gather_elem_vec(gallivm, length,
    455                                      src_width, src_type, fetch_dst_type,
    456                                      aligned, base_ptr, offsets, 0,
    457                                      vector_justify);
    458       return LLVMBuildBitCast(gallivm->builder, res,
    459                               lp_build_vec_type(gallivm, dst_type), "");
    460       /*
    461        * Excluding expansion from these paths because if you need it for
    462        * 32bit/64bit fetches you're doing it wrong (this is gather, not
    463        * conversion) and it would be awkward for floats.
    464        */
    465    } else if (util_cpu_caps.has_avx2 && !need_expansion &&
    466               src_width == 32 && (length == 4 || length == 8)) {
    467       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
    468                                   base_ptr, offsets);
    469    /*
    470     * This looks bad on paper wrt throughtput/latency on Haswell.
    471     * Even on Broadwell it doesn't look stellar.
    472     * Albeit no measurements were done (but tested to work).
    473     * Should definitely enable on Skylake.
    474     * (In general, should be more of a win if the fetch is 256bit wide -
    475     * this is true for the 32bit case above too.)
    476     */
    477    } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
    478               src_width == 64 && (length == 2 || length == 4)) {
    479       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
    480                                   base_ptr, offsets);
    481    } else {
    482       /* Vector */
    483 
    484       LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
    485       unsigned i;
    486       boolean vec_zext = FALSE;
    487       struct lp_type res_type, gather_res_type;
    488       LLVMTypeRef res_t, gather_res_t;
    489 
    490       res_type = fetch_dst_type;
    491       res_type.length *= length;
    492       gather_res_type = res_type;
    493 
    494       if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
    495          /*
    496           * Note that llvm is never able to optimize zext/insert combos
    497           * directly (i.e. zero the simd reg, then place the elements into
    498           * the appropriate place directly). (I think this has to do with
    499           * scalar/vector transition.) And scalar 16->32bit zext simd loads
    500           * aren't possible (instead loading to scalar reg first).
    501           * No idea about other archs...
    502           * We could do this manually, but instead we just use a vector
    503           * zext, which is simple enough (and, in fact, llvm might optimize
    504           * this away).
    505           * (We're not trying that with other bit widths as that might not be
    506           * easier, in particular with 8 bit values at least with only sse2.)
    507           */
    508          assert(vec_fetch == FALSE);
    509          gather_res_type.width /= 2;
    510          fetch_dst_type = fetch_type;
    511          src_type = lp_build_vec_type(gallivm, fetch_type);
    512          vec_zext = TRUE;
    513       }
    514       res_t = lp_build_vec_type(gallivm, res_type);
    515       gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
    516       res = LLVMGetUndef(gather_res_t);
    517       for (i = 0; i < length; ++i) {
    518          LLVMValueRef index = lp_build_const_int32(gallivm, i);
    519          elems[i] = lp_build_gather_elem_vec(gallivm, length,
    520                                              src_width, src_type, fetch_dst_type,
    521                                              aligned, base_ptr, offsets, i,
    522                                              vector_justify);
    523          if (!vec_fetch) {
    524             res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
    525          }
    526       }
    527       if (vec_zext) {
    528          res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
    529          if (vector_justify) {
    530 #ifdef PIPE_ARCH_BIG_ENDIAN
    531             unsigned sv = dst_type.width - src_width;
    532             res = LLVMBuildShl(gallivm->builder, res,
    533                                lp_build_const_int_vec(gallivm, res_type, sv), "");
    534 #endif
    535          }
    536       }
    537       if (vec_fetch) {
    538          /*
    539           * Do bitcast now otherwise llvm might get some funny ideas wrt
    540           * float/int types...
    541           */
    542          for (i = 0; i < length; i++) {
    543             elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
    544                                         lp_build_vec_type(gallivm, dst_type), "");
    545          }
    546          res = lp_build_concat(gallivm, elems, dst_type, length);
    547       } else {
    548          struct lp_type really_final_type = dst_type;
    549          assert(res_type.length * res_type.width ==
    550                 dst_type.length * dst_type.width * length);
    551          really_final_type.length *= length;
    552          res = LLVMBuildBitCast(gallivm->builder, res,
    553                                 lp_build_vec_type(gallivm, really_final_type), "");
    554       }
    555    }
    556 
    557    return res;
    558 }
    559 
    560 LLVMValueRef
    561 lp_build_gather_values(struct gallivm_state * gallivm,
    562                        LLVMValueRef * values,
    563                        unsigned value_count)
    564 {
    565    LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
    566    LLVMBuilderRef builder = gallivm->builder;
    567    LLVMValueRef vec = LLVMGetUndef(vec_type);
    568    unsigned i;
    569 
    570    for (i = 0; i < value_count; i++) {
    571       LLVMValueRef index = lp_build_const_int32(gallivm, i);
    572       vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
    573    }
    574    return vec;
    575 }
    576