Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 
     29 #include "pipe/p_defines.h"
     30 
     31 #include "util/u_format.h"
     32 #include "util/u_memory.h"
     33 #include "util/u_string.h"
     34 #include "util/u_math.h"
     35 
     36 #include "lp_bld_type.h"
     37 #include "lp_bld_const.h"
     38 #include "lp_bld_conv.h"
     39 #include "lp_bld_swizzle.h"
     40 #include "lp_bld_gather.h"
     41 #include "lp_bld_debug.h"
     42 #include "lp_bld_format.h"
     43 #include "lp_bld_arit.h"
     44 #include "lp_bld_pack.h"
     45 
     46 
     47 static void
     48 convert_to_soa(struct gallivm_state *gallivm,
     49                LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
     50                LLVMValueRef dst_soa[4],
     51                const struct lp_type soa_type)
     52 {
     53    unsigned j, k;
     54    struct lp_type aos_channel_type = soa_type;
     55 
     56    LLVMValueRef aos_channels[4];
     57    unsigned pixels_per_channel = soa_type.length / 4;
     58 
     59    debug_assert((soa_type.length % 4) == 0);
     60 
     61    aos_channel_type.length >>= 1;
     62 
     63    for (j = 0; j < 4; ++j) {
     64       LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
     65 
     66       assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
     67 
     68       for (k = 0; k < pixels_per_channel; ++k) {
     69          channel[k] = src_aos[j + 4 * k];
     70       }
     71 
     72       aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
     73    }
     74 
     75    lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
     76 }
     77 
     78 
     79 void
     80 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
     81                             struct lp_build_context *bld,
     82                             const LLVMValueRef *unswizzled,
     83                             LLVMValueRef swizzled_out[4])
     84 {
     85    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
     86       enum pipe_swizzle swizzle;
     87       LLVMValueRef depth_or_stencil;
     88 
     89       if (util_format_has_stencil(format_desc) &&
     90           !util_format_has_depth(format_desc)) {
     91          assert(!bld->type.floating);
     92          swizzle = format_desc->swizzle[1];
     93       }
     94       else {
     95          assert(bld->type.floating);
     96          swizzle = format_desc->swizzle[0];
     97       }
     98       /*
     99        * Return zzz1 or sss1 for depth-stencil formats here.
    100        * Correct swizzling will be handled by apply_sampler_swizzle() later.
    101        */
    102       depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
    103 
    104       swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
    105       swizzled_out[3] = bld->one;
    106    }
    107    else {
    108       unsigned chan;
    109       for (chan = 0; chan < 4; ++chan) {
    110          enum pipe_swizzle swizzle = format_desc->swizzle[chan];
    111          swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
    112       }
    113    }
    114 }
    115 
    116 
    117 
    118 static LLVMValueRef
    119 lp_build_extract_soa_chan(struct lp_build_context *bld,
    120                           unsigned blockbits,
    121                           boolean srgb_chan,
    122                           struct util_format_channel_description chan_desc,
    123                           LLVMValueRef packed)
    124 {
    125    struct gallivm_state *gallivm = bld->gallivm;
    126    LLVMBuilderRef builder = gallivm->builder;
    127    struct lp_type type = bld->type;
    128    LLVMValueRef input = packed;
    129    const unsigned width = chan_desc.size;
    130    const unsigned start = chan_desc.shift;
    131    const unsigned stop = start + width;
    132 
    133    /* Decode the input vector component */
    134 
    135    switch(chan_desc.type) {
    136    case UTIL_FORMAT_TYPE_VOID:
    137       input = bld->undef;
    138       break;
    139 
    140    case UTIL_FORMAT_TYPE_UNSIGNED:
    141       /*
    142        * Align the LSB
    143        */
    144       if (start) {
    145          input = LLVMBuildLShr(builder, input,
    146                                lp_build_const_int_vec(gallivm, type, start), "");
    147       }
    148 
    149       /*
    150        * Zero the MSBs
    151        */
    152       if (stop < blockbits) {
    153          unsigned mask = ((unsigned long long)1 << width) - 1;
    154          input = LLVMBuildAnd(builder, input,
    155                               lp_build_const_int_vec(gallivm, type, mask), "");
    156       }
    157 
    158       /*
    159        * Type conversion
    160        */
    161       if (type.floating) {
    162          if (srgb_chan) {
    163             struct lp_type conv_type = lp_uint_type(type);
    164             input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
    165          }
    166          else {
    167             if(chan_desc.normalized)
    168                input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
    169             else
    170                input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
    171          }
    172       }
    173       else if (chan_desc.pure_integer) {
    174          /* Nothing to do */
    175       } else {
    176           /* FIXME */
    177           assert(0);
    178       }
    179       break;
    180 
    181    case UTIL_FORMAT_TYPE_SIGNED:
    182       /*
    183        * Align the sign bit first.
    184        */
    185       if (stop < type.width) {
    186          unsigned bits = type.width - stop;
    187          LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
    188          input = LLVMBuildShl(builder, input, bits_val, "");
    189       }
    190 
    191       /*
    192        * Align the LSB (with an arithmetic shift to preserve the sign)
    193        */
    194       if (chan_desc.size < type.width) {
    195          unsigned bits = type.width - chan_desc.size;
    196          LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
    197          input = LLVMBuildAShr(builder, input, bits_val, "");
    198       }
    199 
    200       /*
    201        * Type conversion
    202        */
    203       if (type.floating) {
    204          input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
    205          if (chan_desc.normalized) {
    206             double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
    207             LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
    208             input = LLVMBuildFMul(builder, input, scale_val, "");
    209             /*
    210              * The formula above will produce value below -1.0 for most negative
    211              * value but everything seems happy with that hence disable for now.
    212              */
    213             if (0)
    214                input = lp_build_max(bld, input,
    215                                     lp_build_const_vec(gallivm, type, -1.0f));
    216          }
    217       }
    218       else if (chan_desc.pure_integer) {
    219          /* Nothing to do */
    220       } else {
    221           /* FIXME */
    222           assert(0);
    223       }
    224       break;
    225 
    226    case UTIL_FORMAT_TYPE_FLOAT:
    227       if (type.floating) {
    228          if (chan_desc.size == 16) {
    229             struct lp_type f16i_type = type;
    230             f16i_type.width /= 2;
    231             f16i_type.floating = 0;
    232             if (start) {
    233                input = LLVMBuildLShr(builder, input,
    234                                      lp_build_const_int_vec(gallivm, type, start), "");
    235             }
    236             input = LLVMBuildTrunc(builder, input,
    237                                    lp_build_vec_type(gallivm, f16i_type), "");
    238             input = lp_build_half_to_float(gallivm, input);
    239          } else {
    240             assert(start == 0);
    241             assert(stop == 32);
    242             assert(type.width == 32);
    243          }
    244          input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
    245       }
    246       else {
    247          /* FIXME */
    248          assert(0);
    249          input = bld->undef;
    250       }
    251       break;
    252 
    253    case UTIL_FORMAT_TYPE_FIXED:
    254       if (type.floating) {
    255          double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
    256          LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
    257          input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
    258          input = LLVMBuildFMul(builder, input, scale_val, "");
    259       }
    260       else {
    261          /* FIXME */
    262          assert(0);
    263          input = bld->undef;
    264       }
    265       break;
    266 
    267    default:
    268       assert(0);
    269       input = bld->undef;
    270       break;
    271    }
    272 
    273    return input;
    274 }
    275 
    276 
    277 /**
    278  * Unpack several pixels in SoA.
    279  *
    280  * It takes a vector of packed pixels:
    281  *
    282  *   packed = {P0, P1, P2, P3, ..., Pn}
    283  *
    284  * And will produce four vectors:
    285  *
    286  *   red    = {R0, R1, R2, R3, ..., Rn}
    287  *   green  = {G0, G1, G2, G3, ..., Gn}
    288  *   blue   = {B0, B1, B2, B3, ..., Bn}
    289  *   alpha  = {A0, A1, A2, A3, ..., An}
    290  *
    291  * It requires that a packed pixel fits into an element of the output
    292  * channels. The common case is when converting pixel with a depth of 32 bit or
    293  * less into floats.
    294  *
    295  * \param format_desc  the format of the 'packed' incoming pixel vector
    296  * \param type  the desired type for rgba_out (type.length = n, above)
    297  * \param packed  the incoming vector of packed pixels
    298  * \param rgba_out  returns the SoA R,G,B,A vectors
    299  */
    300 void
    301 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
    302                          const struct util_format_description *format_desc,
    303                          struct lp_type type,
    304                          LLVMValueRef packed,
    305                          LLVMValueRef rgba_out[4])
    306 {
    307    struct lp_build_context bld;
    308    LLVMValueRef inputs[4];
    309    unsigned chan;
    310 
    311    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
    312    assert(format_desc->block.width == 1);
    313    assert(format_desc->block.height == 1);
    314    assert(format_desc->block.bits <= type.width);
    315    /* FIXME: Support more output types */
    316    assert(type.width == 32);
    317 
    318    lp_build_context_init(&bld, gallivm, type);
    319 
    320    /* Decode the input vector components */
    321    for (chan = 0; chan < format_desc->nr_channels; ++chan) {
    322       struct util_format_channel_description chan_desc = format_desc->channel[chan];
    323       boolean srgb_chan = FALSE;
    324 
    325       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
    326           format_desc->swizzle[3] != chan) {
    327          srgb_chan = TRUE;
    328       }
    329 
    330       inputs[chan] = lp_build_extract_soa_chan(&bld,
    331                                                format_desc->block.bits,
    332                                                srgb_chan,
    333                                                chan_desc,
    334                                                packed);
    335    }
    336 
    337    lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
    338 }
    339 
    340 
    341 /**
    342  * Convert a vector of rgba8 values into 32bit wide SoA vectors.
    343  *
    344  * \param dst_type  The desired return type. For pure integer formats
    345  *                  this should be a 32bit wide int or uint vector type,
    346  *                  otherwise a float vector type.
    347  *
     348  * \param packed    The rgba8 values to unpack.
    349  *
    350  * \param rgba      The 4 SoA return vectors.
    351  */
    352 void
    353 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
    354                            struct lp_type dst_type,
    355                            LLVMValueRef packed,
    356                            LLVMValueRef *rgba)
    357 {
    358    LLVMBuilderRef builder = gallivm->builder;
    359    LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
    360    unsigned chan;
    361 
    362    /* XXX technically shouldn't use that for uint dst_type */
    363    packed = LLVMBuildBitCast(builder, packed,
    364                              lp_build_int_vec_type(gallivm, dst_type), "");
    365 
    366    /* Decode the input vector components */
    367    for (chan = 0; chan < 4; ++chan) {
    368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    369       unsigned start = chan*8;
    370 #else
    371       unsigned start = (3-chan)*8;
    372 #endif
    373       unsigned stop = start + 8;
    374       LLVMValueRef input;
    375 
    376       input = packed;
    377 
    378       if (start)
    379          input = LLVMBuildLShr(builder, input,
    380                                lp_build_const_int_vec(gallivm, dst_type, start), "");
    381 
    382       if (stop < 32)
    383          input = LLVMBuildAnd(builder, input, mask, "");
    384 
    385       if (dst_type.floating)
    386          input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
    387 
    388       rgba[chan] = input;
    389    }
    390 }
    391 
    392 
    393 
    394 /**
     395  * Fetch texels from a texture, returning them in SoA layout.
    396  *
    397  * \param type  the desired return type for 'rgba'.  The vector length
    398  *              is the number of texels to fetch
    399  * \param aligned if the offset is guaranteed to be aligned to element width
    400  *
    401  * \param base_ptr  points to the base of the texture mip tree.
    402  * \param offset    offset to start of the texture image block.  For non-
    403  *                  compressed formats, this simply is an offset to the texel.
    404  *                  For compressed formats, it is an offset to the start of the
    405  *                  compressed data block.
    406  *
    407  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
    408  *              these will always be (0,0).  For compressed formats, i will
    409  *              be in [0, block_width-1] and j will be in [0, block_height-1].
    410  * \param cache  optional value pointing to a lp_build_format_cache structure
    411  */
    412 void
    413 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
    414                         const struct util_format_description *format_desc,
    415                         struct lp_type type,
    416                         boolean aligned,
    417                         LLVMValueRef base_ptr,
    418                         LLVMValueRef offset,
    419                         LLVMValueRef i,
    420                         LLVMValueRef j,
    421                         LLVMValueRef cache,
    422                         LLVMValueRef rgba_out[4])
    423 {
    424    LLVMBuilderRef builder = gallivm->builder;
    425    enum pipe_format format = format_desc->format;
    426    struct lp_type fetch_type;
    427 
    428    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
    429        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
    430         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
    431         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
    432        format_desc->block.width == 1 &&
    433        format_desc->block.height == 1 &&
    434        format_desc->block.bits <= type.width &&
    435        (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
    436         format_desc->channel[0].size == 32 ||
    437         format_desc->channel[0].size == 16))
    438    {
    439       /*
    440        * The packed pixel fits into an element of the destination format. Put
    441        * the packed pixels into a vector and extract each component for all
    442        * vector elements in parallel.
    443        */
    444 
    445       LLVMValueRef packed;
    446 
    447       /*
    448        * gather the texels from the texture
    449        * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
    450        */
    451       assert(format_desc->block.bits <= type.width);
    452       fetch_type = lp_type_uint(type.width);
    453       packed = lp_build_gather(gallivm,
    454                                type.length,
    455                                format_desc->block.bits,
    456                                fetch_type,
    457                                aligned,
    458                                base_ptr, offset, FALSE);
    459 
    460       /*
    461        * convert texels to float rgba
    462        */
    463       lp_build_unpack_rgba_soa(gallivm,
    464                                format_desc,
    465                                type,
    466                                packed, rgba_out);
    467       return;
    468    }
    469 
    470 
    471    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
    472        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
    473        format_desc->block.width == 1 &&
    474        format_desc->block.height == 1 &&
    475        format_desc->block.bits > type.width &&
    476        ((format_desc->block.bits <= type.width * type.length &&
    477          format_desc->channel[0].size <= type.width) ||
    478         (format_desc->channel[0].size == 64 &&
    479          format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
    480          type.floating)))
    481    {
    482       /*
    483        * Similar to above, but the packed pixel is larger than what fits
    484        * into an element of the destination format. The packed pixels will be
    485        * shuffled into SoA vectors appropriately, and then the extraction will
    486        * be done in parallel as much as possible.
    487        * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
    488        * the gathered vectors can be shuffled easily (even with avx).
    489        * 64xn float -> 32xn float is handled too but it's a bit special as
    490        * it does the conversion pre-shuffle.
    491        */
    492 
    493       LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
    494       struct lp_type fetch_type, gather_type = type;
    495       unsigned num_gather, fetch_width, i, j;
    496       struct lp_build_context bld;
    497       boolean fp64 = format_desc->channel[0].size == 64;
    498 
    499       lp_build_context_init(&bld, gallivm, type);
    500 
    501       assert(type.width == 32);
    502       assert(format_desc->block.bits > type.width);
    503 
    504       /*
    505        * First, figure out fetch order.
    506        */
    507       fetch_width = util_next_power_of_two(format_desc->block.bits);
    508       /*
    509        * fp64 are treated like fp32 except we fetch twice wide values
    510        * (as we shuffle after trunc). The shuffles for that work out
    511        * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
    512        * albeit we miss the potential opportunity for hw gather (as it
    513        * only handles native size).
    514        */
    515       num_gather = fetch_width / type.width;
    516       gather_type.width *= num_gather;
    517       if (fp64) {
    518          num_gather /= 2;
    519       }
    520       gather_type.length /= num_gather;
    521 
    522       for (i = 0; i < num_gather; i++) {
    523          LLVMValueRef offsetr, shuf_vec;
    524          if(num_gather == 4) {
    525             for (j = 0; j < gather_type.length; j++) {
    526                unsigned idx = i + 4*j;
    527                shuffles[j] = lp_build_const_int32(gallivm, idx);
    528             }
    529             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
    530             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
    531 
    532          }
    533          else if (num_gather == 2) {
    534             assert(num_gather == 2);
    535             for (j = 0; j < gather_type.length; j++) {
    536                unsigned idx = i*2 + (j%2) + (j/2)*4;
    537                shuffles[j] = lp_build_const_int32(gallivm, idx);
    538             }
    539             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
    540             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
    541          }
    542          else {
    543             assert(num_gather == 1);
    544             offsetr = offset;
    545          }
    546          if (gather_type.length == 1) {
    547             LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
    548             offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
    549          }
    550 
    551          /*
    552           * Determine whether to use float or int loads. This is mostly
    553           * to outsmart the (stupid) llvm int/float shuffle logic, we
    554           * don't really care much if the data is floats or ints...
    555           * But llvm will refuse to use single float shuffle with int data
    556           * and instead use 3 int shuffles instead, the code looks atrocious.
    557           * (Note bitcasts often won't help, as llvm is too smart to be
    558           * fooled by that.)
    559           * Nobody cares about simd float<->int domain transition penalties,
    560           * which usually don't even exist for shuffles anyway.
    561           * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
    562           * going into transpose, which is unpacks, so doesn't really matter
    563           * much).
    564           * With 2x32bit or 4x16bit fetch, we use float vec, since those
    565           * go into the weird channel separation shuffle. With floats,
    566           * this is (with 128bit vectors):
    567           * - 2 movq, 2 movhpd, 2 shufps
    568           * With ints it would be:
    569           * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
    570           * I've seen texture functions increase in code size by 15% just due
    571           * to that (there's lots of such fetches in them...)
    572           * (We could chose a different gather order to improve this somewhat
    573           * for the int path, but it would basically just drop the blends,
    574           * so the float path with this order really is optimal.)
    575           * Albeit it is tricky sometimes llvm doesn't ignore the float->int
    576           * casts so must avoid them until we're done with the float shuffle...
    577           * 3x16bit formats (the same is also true for 3x8) are pretty bad but
    578           * there's nothing we can do about them (we could overallocate by
    579           * those couple bytes and use unaligned but pot sized load).
    580           * Note that this is very much x86 specific. I don't know if this
    581           * affect other archs at all.
    582           */
    583          if (num_gather > 1) {
    584             /*
    585              * We always want some float type here (with x86)
    586              * due to shuffles being float ones afterwards (albeit for
    587              * the num_gather == 4 case int should work fine too
    588              * (unless there's some problems with avx but not avx2).
    589              */
    590             if (format_desc->channel[0].size == 64) {
    591                fetch_type = lp_type_float_vec(64, gather_type.width);
    592             } else {
    593                fetch_type = lp_type_int_vec(32, gather_type.width);
    594             }
    595          }
    596          else {
    597             /* type doesn't matter much */
    598             if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
    599                 (format_desc->channel[0].size == 32 ||
    600                  format_desc->channel[0].size == 64)) {
    601             fetch_type = lp_type_float(gather_type.width);
    602             } else {
    603                fetch_type = lp_type_uint(gather_type.width);
    604             }
    605          }
    606 
    607          /* Now finally gather the values */
    608          packed[i] = lp_build_gather(gallivm, gather_type.length,
    609                                      format_desc->block.bits,
    610                                      fetch_type, aligned,
    611                                      base_ptr, offsetr, FALSE);
    612          if (fp64) {
    613             struct lp_type conv_type = type;
    614             conv_type.width *= 2;
    615             packed[i] = LLVMBuildBitCast(builder, packed[i],
    616                                          lp_build_vec_type(gallivm, conv_type), "");
    617             packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
    618          }
    619       }
    620 
    621       /* shuffle the gathered values to SoA */
    622       if (num_gather == 2) {
    623          for (i = 0; i < num_gather; i++) {
    624             for (j = 0; j < type.length; j++) {
    625                unsigned idx = (j%2)*2 + (j/4)*4 + i;
    626                if ((j/2)%2)
    627                   idx += type.length;
    628                shuffles[j] = lp_build_const_int32(gallivm, idx);
    629             }
    630             dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
    631                                             LLVMConstVector(shuffles, type.length), "");
    632          }
    633       }
    634       else if (num_gather == 4) {
    635          lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
    636       }
    637       else {
    638          assert(num_gather == 1);
    639          dst[0] = packed[0];
    640       }
    641 
    642       /*
    643        * And finally unpack exactly as above, except that
    644        * chan shift is adjusted and the right vector selected.
    645        */
    646       if (!fp64) {
    647          for (i = 0; i < num_gather; i++) {
    648             dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
    649          }
    650          for (i = 0; i < format_desc->nr_channels; i++) {
    651             struct util_format_channel_description chan_desc = format_desc->channel[i];
    652             unsigned blockbits = type.width;
    653             unsigned vec_nr;
    654 
    655 #ifdef PIPE_ARCH_BIG_ENDIAN
    656             vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
    657 #else
    658             vec_nr = chan_desc.shift / type.width;
    659 #endif
    660             chan_desc.shift %= type.width;
    661 
    662             output[i] = lp_build_extract_soa_chan(&bld,
    663                                                   blockbits,
    664                                                   FALSE,
    665                                                   chan_desc,
    666                                                   dst[vec_nr]);
    667          }
    668       }
    669       else {
    670          for (i = 0; i < format_desc->nr_channels; i++)  {
    671             output[i] = dst[i];
    672          }
    673       }
    674 
    675       lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
    676       return;
    677    }
    678 
    679    if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
    680        format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
    681       /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      /* Gather raw bits as unsigned ints of the SoA channel width. */
      struct lp_type fetch_type = lp_type_uint(type.width);

      /* These packed-float formats can only be unpacked to 32-bit float SoA. */
      assert(type.floating);
      assert(type.width == 32);

      /* Fetch one packed word (block.bits wide) per pixel. */
      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         /* NOTE(review): presumably only the rgb9e5 shared-exponent format
          * reaches this branch — the guarding condition is above this chunk;
          * confirm there. */
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         /* Keep only the low 8 stencil bits of the fetched dword. */
         unsigned mask = (1 << 8) - 1;
         /* Step over the leading 4-byte Z word to reach the stencil dword
          * (assumes Z-then-S ordering in memory — TODO confirm). */
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         /* Depth occupies the first 32-bit word of each block; fetch it and
          * reinterpret the raw bits as the caller's float vector type. */
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      /* 8-bit normalized, 4 channels per pixel -> 4*length lanes total. */
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      /* Reinterpret the byte vector as 32-bit ints for SoA unpacking. */
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         /* Luminance-alpha: force alpha to come from the aos result's W
          * channel (the LA special case mentioned above). */
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /* Fetch a single 4-channel pixel per aos call. */
      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         /* Scalar byte offset and in-block texel coords for pixel k. */
         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      /* Transpose the per-pixel AoS vectors into SoA channel vectors. */
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
    861