/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
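 *
 * For example (an illustrative sketch, not part of the original comments),
 * with n = 4 the resulting shuffle masks are:
 *   lo_hi = 0 (lo): {0, 4, 1, 5}
 *   lo_hi = 1 (hi): {2, 6, 3, 7}
 * i.e. the low (or high) halves of the two source vectors interleaved
 * element by element, which is what PUNPCKLDQ/PUNPCKHDQ do on 4x32-bit
 * vectors.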
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Similar to lp_build_const_unpack_shuffle, but for the special AVX 256bit
 * unpack. See the comment above lp_build_interleave2_half for more details.
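 *
 * For example (an illustrative sketch), with n = 8 the masks come out as:
 *   lo_hi = 0 (lo): {0, 8, 1, 9, 4, 12, 5, 13}
 *   lo_hi = 1 (hi): {2, 10, 3, 11, 6, 14, 7, 15}
 * i.e. each 128-bit half of the two source vectors is interleaved
 * independently, matching the AVX unpack instructions.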
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Build shuffle vectors that match PACKxx (SSE) instructions or
 * VPERM (Altivec).
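 *
 * For example (an illustrative sketch), on a little-endian machine with
 * n = 4 the mask is {0, 2, 4, 6}: viewed through the narrower destination
 * type, those are the low halves of each wide source element, so the
 * shuffle performs the truncating pack.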
 */
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      elems[i] = lp_build_const_int32(gallivm, 2*i);
#else
      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
#endif

   return LLVMConstVector(elems, n);
}

/**
 * Return a vector with elements src[start:start+size]
 * Most useful for getting half the values out of a 256bit sized vector,
 * otherwise it may cause data rearrangement.
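 *
 * Usage sketch (illustrative, assuming src is an 8x32-bit vector):
 *
 *   LLVMValueRef lo = lp_build_extract_range(gallivm, src, 0, 4);
 *   LLVMValueRef hi = lp_build_extract_range(gallivm, src, 4, 4);
 *
 * extracts the lower and upper 128-bit halves without any cross-lane
 * shuffling.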
 */
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= ARRAY_SIZE(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}

/**
 * Concatenates several vectors (of the same type) into a larger one.
 * The number of vectors must be a power of 2.
 * Most useful for building up a 256bit sized vector out of two 128bit ones.
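 *
 * Usage sketch (illustrative, assuming two 4x32-bit vectors a and b of
 * type src_type):
 *
 *   LLVMValueRef parts[2] = { a, b };
 *   LLVMValueRef wide = lp_build_concat(gallivm, parts, src_type, 2);
 *
 * yields an 8x32-bit vector holding a's elements followed by b's.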
 */
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
   assert(util_is_power_of_two(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   /* Concatenate pairwise: each pass halves the vector count and doubles
    * the vector length. */
   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}


/**
 * Combines vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated in a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * e.g. For num_srcs = 4 and src = [x, y, z, w]
 *          num_dsts = 1  dst = [xyzw]    return = 4
 *          num_dsts = 2  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   unsigned i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}


/**
 * Un-interleave vector.
 * This will return a vector consisting of every second element
 * (depending on lo_hi, beginning at 0 or 1).
 * The returned vector will contain half as many elements as the source
 * vector (and hence half its total bit width).
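 *
 * For example (an illustrative sketch), with num_elems = 8:
 *   lo_hi = 0 selects elements {0, 2, 4, 6}
 *   lo_hi = 1 selects elements {1, 3, 5, 7}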
 */
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
                       unsigned num_elems,
                       LLVMValueRef a,
                       unsigned lo_hi)
{
   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   assert(num_elems <= LP_MAX_VECTOR_LENGTH);

   for (i = 0; i < num_elems / 2; ++i)
      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);

   shuffle = LLVMConstVector(elems, num_elems / 2);

   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
}


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for an llvm code generation deficiency.
       * Strangely enough, while this operation needs vinsertf128/vextractf128
       * instructions (hence a natural match when using 2x128bit vectors), the
       * "normal" unpack shuffle generates code ranging from atrocious (llvm
       * 3.1) to terrible (llvm 3.2, 3.3). So use different shuffles instead
       * (the exact shuffles don't seem to matter, as long as they avoid
       * 128-bit wide vector elements; both 8x32 and 4x64 work).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}

/**
 * Interleave vector elements, but for 256-bit vectors treat the inputs
 * as two concatenated 128-bit vectors and interleave those independently.
 *
 * This differs from lp_build_interleave2, which (for lo) would produce
 * a0 b0 a1 b1 a2 b2 a3 b3; that ordering does not compile into a single
 * AVX unpack instruction.
 *
 * An example interleave of 8x float with 8x float on AVX 256bit unpack:
 *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * Equivalent to interleaving 2x 128 bit vectors
 *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo would result in:
 *   a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}


/**
 * Double the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
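 *
 * For example (an illustrative sketch), unpacking 8 x 16-bit into
 * 2 x (4 x 32-bit):
 *   src    = s0 s1 s2 s3 s4 s5 s6 s7
 *   dst_lo = s0 s1 s2 s3   (each sign- or zero-extended to 32 bit)
 *   dst_hi = s4 s5 s6 s7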
 *
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);

#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Double the bit width, with an ordering that fits the CPU nicely.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * The order of the results is not guaranteed, other than it will match
 * the corresponding lp_build_pack2_native call.
 */
void
lp_build_unpack2_native(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef src,
                        LLVMValueRef *dst_lo,
                        LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
               lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
   } else {
      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
   }
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Expand the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
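 *
 * For example (an illustrative sketch), expanding 16 x 8-bit into
 * 4 x (4 x 32-bit) takes num_dsts = 4 and two rounds of
 * lp_build_unpack2 (8 -> 16 -> 32 bits).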
 */
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
                          &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}


/**
 * Non-interleaved pack.
 *
 * This will move values as
 *         (LSB)                     (MSB)
 *   lo =   l0 __ l1 __ l2 __..  __ ln __
 *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead if the values are not known to be clamped.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
        src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;
      boolean swap_intrinsic_operands = FALSE;

      switch(src_type.width) {
      case 32:
         if (util_cpu_caps.has_sse2) {
            if (dst_type.sign) {
               intrinsic = "llvm.x86.sse2.packssdw.128";
            } else {
               if (util_cpu_caps.has_sse4_1) {
                  intrinsic = "llvm.x86.sse41.packusdw";
               }
            }
         } else if (util_cpu_caps.has_altivec) {
            if (dst_type.sign) {
               intrinsic = "llvm.ppc.altivec.vpkswss";
            } else {
               intrinsic = "llvm.ppc.altivec.vpkuwus";
            }
#ifdef PIPE_ARCH_LITTLE_ENDIAN
            swap_intrinsic_operands = TRUE;
#endif
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshss";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         } else {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshus";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            if (swap_intrinsic_operands) {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
            } else {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            }
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            /*
             * Wider than 128 bit: split the operands into 128-bit chunks,
             * pack each pair of chunks with the intrinsic, and concatenate
             * the results back into a single vector.
             */
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            int lo_off = swap_intrinsic_operands ? nlen : 0;
            int hi_off = swap_intrinsic_operands ? 0 : nlen;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + hi_off, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + hi_off, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}


/**
 * Non-interleaved native pack.
 *
 * Similar to lp_build_pack2, but the ordering of values is not
 * guaranteed, other than it will match lp_build_unpack2_native.
 *
 * In particular, with avx2, the lower and upper 128bits of the vectors will
 * be packed independently, so that (with 32bit->16bit values)
 *         (LSB)                                       (MSB)
 *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
 *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
 *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results.
 */
LLVMValueRef
lp_build_pack2_native(struct gallivm_state *gallivm,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef lo,
                      LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type intr_type = dst_type;
   const char *intrinsic = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* At this point we only have a special case for avx2 */
   if (src_type.length * src_type.width == 256 &&
       util_cpu_caps.has_avx2) {
      switch(src_type.width) {
      case 32:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packssdw";
         } else {
            intrinsic = "llvm.x86.avx2.packusdw";
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packsswb";
         } else {
            intrinsic = "llvm.x86.avx2.packuswb";
         }
         break;
      }
   }
   if (intrinsic) {
      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
                                       lo, hi);
   }
   else {
      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
   }
}

/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
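 *
 * For example (an illustrative sketch), packing unsigned 32-bit values
 * into unsigned 16-bit: each input is first min'ed against 0xffff, so
 * 0x12345 packs to 0xffff instead of wrapping to 0x2345.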
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
                                ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
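 *
 * For example (an illustrative sketch), truncating 4 x (4 x 32-bit) into
 * 1 x (16 x 8-bit) takes num_srcs = 4 and two rounds of pack2
 * (32 -> 16 -> 8 bits).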
 */
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take the sign change into consideration only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}


/**
 * Truncate or expand the bit width.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
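 *
 * For example (an illustrative sketch), resizing 1 x (8 x 16-bit) into
 * 2 x (4 x 32-bit) keeps the total channel count (8) while doubling the
 * bit width of each channel.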
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      /* Conversion must be M:1 */
      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      /* Conversion must be 1:N */
      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      /* "Conversion" must be N:N */
      assert(num_srcs == num_dsts);

      for(i = 0; i < num_dsts; ++i)
         tmp[i] = src[i];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}


/**
 * Expands the src vector from src.length elements to dst_length elements,
 * leaving the new trailing elements undefined.
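 *
 * For example (an illustrative sketch), padding a 4 x float vector to
 * dst_length = 8 keeps elements 0-3 and leaves elements 4-7 undefined.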
 */
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    unsigned dst_length)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef undef;
   LLVMTypeRef type;
   unsigned i, src_length;

   type = LLVMTypeOf(src);

   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
      /* Can't use ShuffleVector on non-vector type */
      undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
      return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
   }

   undef      = LLVMGetUndef(type);
   src_length = LLVMGetVectorSize(type);

   assert(dst_length <= ARRAY_SIZE(elems));
   assert(dst_length >= src_length);

   if (src_length == dst_length)
      return src;

   /* All elements from src vector */
   for (i = 0; i < src_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Fill the remaining space with undef */
   for (i = src_length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
}
   1028