Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 /**
     29  * @file
     30  * AoS pixel format manipulation.
     31  *
     32  * @author Jose Fonseca <jfonseca (at) vmware.com>
     33  */
     34 
     35 
     36 #include "util/u_format.h"
     37 #include "util/u_memory.h"
     38 #include "util/u_math.h"
     39 #include "util/u_pointer.h"
     40 #include "util/u_string.h"
     41 #include "util/u_cpu_detect.h"
     42 
     43 #include "lp_bld_arit.h"
     44 #include "lp_bld_init.h"
     45 #include "lp_bld_type.h"
     46 #include "lp_bld_flow.h"
     47 #include "lp_bld_const.h"
     48 #include "lp_bld_conv.h"
     49 #include "lp_bld_swizzle.h"
     50 #include "lp_bld_gather.h"
     51 #include "lp_bld_debug.h"
     52 #include "lp_bld_format.h"
     53 #include "lp_bld_pack.h"
     54 #include "lp_bld_intr.h"
     55 #include "lp_bld_logic.h"
     56 #include "lp_bld_bitarit.h"
     57 
     58 
     59 /**
     60  * Basic swizzling.  Rearrange the order of the unswizzled array elements
     61  * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
     62  * too.
     63  * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
     64  */
     65 LLVMValueRef
     66 lp_build_format_swizzle_aos(const struct util_format_description *desc,
     67                             struct lp_build_context *bld,
     68                             LLVMValueRef unswizzled)
     69 {
     70    unsigned char swizzles[4];
     71    unsigned chan;
     72 
     73    assert(bld->type.length % 4 == 0);
     74 
     75    for (chan = 0; chan < 4; ++chan) {
     76       enum pipe_swizzle swizzle;
     77 
     78       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
     79          /*
     80           * For ZS formats do RGBA = ZZZ1
     81           */
     82          if (chan == 3) {
     83             swizzle = PIPE_SWIZZLE_1;
     84          } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
     85             swizzle = PIPE_SWIZZLE_0;
     86          } else {
     87             swizzle = desc->swizzle[0];
     88          }
     89       } else {
     90          swizzle = desc->swizzle[chan];
     91       }
     92       swizzles[chan] = swizzle;
     93    }
     94 
     95    return lp_build_swizzle_aos(bld, unswizzled, swizzles);
     96 }
     97 
     98 
     99 /**
    100  * Whether the format matches the vector type, apart of swizzles.
    101  */
    102 static inline boolean
    103 format_matches_type(const struct util_format_description *desc,
    104                     struct lp_type type)
    105 {
    106    enum util_format_type chan_type;
    107    unsigned chan;
    108 
    109    assert(type.length % 4 == 0);
    110 
    111    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
    112        desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
    113        desc->block.width != 1 ||
    114        desc->block.height != 1) {
    115       return FALSE;
    116    }
    117 
    118    if (type.floating) {
    119       chan_type = UTIL_FORMAT_TYPE_FLOAT;
    120    } else if (type.fixed) {
    121       chan_type = UTIL_FORMAT_TYPE_FIXED;
    122    } else if (type.sign) {
    123       chan_type = UTIL_FORMAT_TYPE_SIGNED;
    124    } else {
    125       chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
    126    }
    127 
    128    for (chan = 0; chan < desc->nr_channels; ++chan) {
    129       if (desc->channel[chan].size != type.width) {
    130          return FALSE;
    131       }
    132 
    133       if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
    134          if (desc->channel[chan].type != chan_type ||
    135              desc->channel[chan].normalized != type.norm) {
    136             return FALSE;
    137          }
    138       }
    139    }
    140 
    141    return TRUE;
    142 }
    143 
    144 /*
    145  * Do rounding when converting small unorm values to larger ones.
    146  * Not quite 100% accurate, as it's done by appending MSBs, but
    147  * should be good enough.
    148  */
    149 
    150 static inline LLVMValueRef
    151 scale_bits_up(struct gallivm_state *gallivm,
    152               int src_bits,
    153               int dst_bits,
    154               LLVMValueRef src,
    155               struct lp_type src_type)
    156 {
    157    LLVMBuilderRef builder = gallivm->builder;
    158    LLVMValueRef result = src;
    159 
    160    if (src_bits == 1 && dst_bits > 1) {
    161       /*
    162        * Useful for a1 - we'd need quite some repeated copies otherwise.
    163        */
    164       struct lp_build_context bld;
    165       LLVMValueRef dst_mask;
    166       lp_build_context_init(&bld, gallivm, src_type);
    167       dst_mask = lp_build_const_int_vec(gallivm, src_type,
    168                                         (1 << dst_bits) - 1),
    169       result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
    170                             lp_build_const_int_vec(gallivm, src_type, 0));
    171       result = lp_build_andnot(&bld, dst_mask, result);
    172    }
    173    else if (dst_bits > src_bits) {
    174       /* Scale up bits */
    175       int db = dst_bits - src_bits;
    176 
    177       /* Shift left by difference in bits */
    178       result = LLVMBuildShl(builder,
    179                             src,
    180                             lp_build_const_int_vec(gallivm, src_type, db),
    181                             "");
    182 
    183       if (db <= src_bits) {
    184          /* Enough bits in src to fill the remainder */
    185          LLVMValueRef lower = LLVMBuildLShr(builder,
    186                                             src,
    187                                             lp_build_const_int_vec(gallivm, src_type,
    188                                                                    src_bits - db),
    189                                             "");
    190 
    191          result = LLVMBuildOr(builder, result, lower, "");
    192       } else if (db > src_bits) {
    193          /* Need to repeatedly copy src bits to fill remainder in dst */
    194          unsigned n;
    195 
    196          for (n = src_bits; n < dst_bits; n *= 2) {
    197             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
    198 
    199             result = LLVMBuildOr(builder,
    200                                  result,
    201                                  LLVMBuildLShr(builder, result, shuv, ""),
    202                                  "");
    203          }
    204       }
    205    } else {
    206       assert (dst_bits == src_bits);
    207    }
    208 
    209    return result;
    210 }
    211 
    212 /**
    213  * Unpack a single pixel into its XYZW components.
    214  *
    215  * @param desc  the pixel format for the packed pixel value
    216  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
    217  *
    218  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
    219  */
    220 static inline LLVMValueRef
    221 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    222                                const struct util_format_description *desc,
    223                                LLVMValueRef packed)
    224 {
    225    LLVMBuilderRef builder = gallivm->builder;
    226    LLVMValueRef shifted, casted, scaled, masked;
    227    LLVMValueRef shifts[4];
    228    LLVMValueRef masks[4];
    229    LLVMValueRef scales[4];
    230    LLVMTypeRef vec32_type;
    231 
    232    boolean normalized;
    233    boolean needs_uitofp;
    234    unsigned i;
    235 
    236    /* TODO: Support more formats */
    237    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
    238    assert(desc->block.width == 1);
    239    assert(desc->block.height == 1);
    240    assert(desc->block.bits <= 32);
    241 
    242    /* Do the intermediate integer computations with 32bit integers since it
    243     * matches floating point size */
    244    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
    245 
    246    vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
    247 
    248    /* Broadcast the packed value to all four channels
    249     * before: packed = BGRA
    250     * after: packed = {BGRA, BGRA, BGRA, BGRA}
    251     */
    252    packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
    253                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
    254                                    "");
    255    packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
    256                                    LLVMConstNull(vec32_type),
    257                                    "");
    258 
    259    /* Initialize vector constants */
    260    normalized = FALSE;
    261    needs_uitofp = FALSE;
    262 
    263    /* Loop over 4 color components */
    264    for (i = 0; i < 4; ++i) {
    265       unsigned bits = desc->channel[i].size;
    266       unsigned shift = desc->channel[i].shift;
    267 
    268       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
    269          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    270          masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
    271          scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
    272       }
    273       else {
    274          unsigned long long mask = (1ULL << bits) - 1;
    275 
    276          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
    277 
    278          if (bits == 32) {
    279             needs_uitofp = TRUE;
    280          }
    281 
    282          shifts[i] = lp_build_const_int32(gallivm, shift);
    283          masks[i] = lp_build_const_int32(gallivm, mask);
    284 
    285          if (desc->channel[i].normalized) {
    286             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
    287             normalized = TRUE;
    288          }
    289          else
    290             scales[i] =  lp_build_const_float(gallivm, 1.0);
    291       }
    292    }
    293 
    294    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
    295     * into masked = {X, Y, Z, W}
    296     */
    297    if (desc->block.bits < 32 && normalized) {
    298       /*
    299        * Note: we cannot do the shift below on x86 natively until AVX2.
    300        *
    301        * Old llvm versions will resort to scalar extract/shift insert,
    302        * which is definitely terrible, new versions will just do
    303        * several vector shifts and shuffle/blend results together.
    304        * We could turn this into a variable left shift plus a constant
    305        * right shift, and llvm would then turn the variable left shift
    306        * into a mul for us (albeit without sse41 the mul needs emulation
    307        * too...). However, since we're going to do a float mul
    308        * anyway, we just adjust that mul instead (plus the mask), skipping
    309        * the shift completely.
    310        * We could also use a extra mul when the format isn't normalized and
    311        * we don't have AVX2 support, but don't bother for now. Unfortunately,
    312        * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
    313        * rgba8 if it ends up here), as that would require UIToFP, albeit that
    314        * would be fixable with easy 16bit shuffle (unless there's channels
    315        * crossing 16bit boundaries).
    316        */
    317       for (i = 0; i < 4; ++i) {
    318          if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
    319             unsigned bits = desc->channel[i].size;
    320             unsigned shift = desc->channel[i].shift;
    321             unsigned long long mask = ((1ULL << bits) - 1) << shift;
    322             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
    323             masks[i] = lp_build_const_int32(gallivm, mask);
    324          }
    325       }
    326       masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
    327    } else {
    328       shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
    329       masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
    330    }
    331 
    332    if (!needs_uitofp) {
    333       /* UIToFP can't be expressed in SSE2 */
    334       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
    335    } else {
    336       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
    337    }
    338 
    339    /*
    340     * At this point 'casted' may be a vector of floats such as
    341     * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
    342     * by powers of two). Next, if the pixel values are normalized
    343     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
    344     */
    345 
    346    if (normalized)
    347       scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
    348    else
    349       scaled = casted;
    350 
    351    return scaled;
    352 }
    353 
    354 
    355 /**
    356  * Pack a single pixel.
    357  *
    358  * @param rgba 4 float vector with the unpacked components.
    359  *
    360  * XXX: This is mostly for reference and testing -- operating a single pixel at
    361  * a time is rarely if ever needed.
    362  */
    363 LLVMValueRef
    364 lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
    365                        const struct util_format_description *desc,
    366                        LLVMValueRef rgba)
    367 {
    368    LLVMBuilderRef builder = gallivm->builder;
    369    LLVMTypeRef type;
    370    LLVMValueRef packed = NULL;
    371    LLVMValueRef swizzles[4];
    372    LLVMValueRef shifted, casted, scaled, unswizzled;
    373    LLVMValueRef shifts[4];
    374    LLVMValueRef scales[4];
    375    boolean normalized;
    376    unsigned i, j;
    377 
    378    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
    379    assert(desc->block.width == 1);
    380    assert(desc->block.height == 1);
    381 
    382    type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
    383 
    384    /* Unswizzle the color components into the source vector. */
    385    for (i = 0; i < 4; ++i) {
    386       for (j = 0; j < 4; ++j) {
    387          if (desc->swizzle[j] == i)
    388             break;
    389       }
    390       if (j < 4)
    391          swizzles[i] = lp_build_const_int32(gallivm, j);
    392       else
    393          swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    394    }
    395 
    396    unswizzled = LLVMBuildShuffleVector(builder, rgba,
    397                                        LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
    398                                        LLVMConstVector(swizzles, 4), "");
    399 
    400    normalized = FALSE;
    401    for (i = 0; i < 4; ++i) {
    402       unsigned bits = desc->channel[i].size;
    403       unsigned shift = desc->channel[i].shift;
    404 
    405       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
    406          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    407          scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
    408       }
    409       else {
    410          unsigned mask = (1 << bits) - 1;
    411 
    412          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
    413          assert(bits < 32);
    414 
    415          shifts[i] = lp_build_const_int32(gallivm, shift);
    416 
    417          if (desc->channel[i].normalized) {
    418             scales[i] = lp_build_const_float(gallivm, mask);
    419             normalized = TRUE;
    420          }
    421          else
    422             scales[i] = lp_build_const_float(gallivm, 1.0);
    423       }
    424    }
    425 
    426    if (normalized)
    427       scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
    428    else
    429       scaled = unswizzled;
    430 
    431    casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
    432 
    433    shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
    434 
    435    /* Bitwise or all components */
    436    for (i = 0; i < 4; ++i) {
    437       if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
    438          LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
    439                                                lp_build_const_int32(gallivm, i), "");
    440          if (packed)
    441             packed = LLVMBuildOr(builder, packed, component, "");
    442          else
    443             packed = component;
    444       }
    445    }
    446 
    447    if (!packed)
    448       packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    449 
    450    if (desc->block.bits < 32)
    451       packed = LLVMBuildTrunc(builder, packed, type, "");
    452 
    453    return packed;
    454 }
    455 
    456 
    457 
    458 
    459 /**
    460  * Fetch a pixel into a 4 float AoS.
    461  *
    462  * \param format_desc  describes format of the image we're fetching from
    463  * \param aligned  whether the data is guaranteed to be aligned
    464  * \param ptr  address of the pixel block (or the texel if uncompressed)
    465  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
    466  *              these will always be (0, 0).
    467  * \return  a 4 element vector with the pixel's RGBA values.
    468  */
    469 LLVMValueRef
    470 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
    471                         const struct util_format_description *format_desc,
    472                         struct lp_type type,
    473                         boolean aligned,
    474                         LLVMValueRef base_ptr,
    475                         LLVMValueRef offset,
    476                         LLVMValueRef i,
    477                         LLVMValueRef j,
    478                         LLVMValueRef cache)
    479 {
    480    LLVMBuilderRef builder = gallivm->builder;
    481    unsigned num_pixels = type.length / 4;
    482    struct lp_build_context bld;
    483 
    484    assert(type.length <= LP_MAX_VECTOR_LENGTH);
    485    assert(type.length % 4 == 0);
    486 
    487    lp_build_context_init(&bld, gallivm, type);
    488 
    489    /*
    490     * Trivial case
    491     *
    492     * The format matches the type (apart of a swizzle) so no need for
    493     * scaling or converting.
    494     */
    495 
    496    if (format_matches_type(format_desc, type) &&
    497        format_desc->block.bits <= type.width * 4 &&
    498        /* XXX this shouldn't be needed */
    499        util_is_power_of_two(format_desc->block.bits)) {
    500       LLVMValueRef packed;
    501       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
    502       struct lp_type fetch_type;
    503       unsigned vec_len = type.width * type.length;
    504 
    505       /*
    506        * The format matches the type (apart of a swizzle) so no need for
    507        * scaling or converting.
    508        */
    509 
    510       fetch_type = lp_type_uint(type.width*4);
    511       packed = lp_build_gather(gallivm, type.length/4,
    512                                format_desc->block.bits, fetch_type,
    513                                aligned, base_ptr, offset, TRUE);
    514 
    515       assert(format_desc->block.bits <= vec_len);
    516       (void) vec_len; /* silence unused var warning for non-debug build */
    517 
    518       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
    519       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
    520    }
    521 
    522    /*
    523     * Bit arithmetic for converting small_unorm to unorm8.
    524     *
    525     * This misses some opportunities for optimizations (like skipping mask
    526     * for the highest channel for instance, or doing bit scaling in parallel
    527     * for channels with the same bit width) but it should be passable for
    528     * all arithmetic formats.
    529     */
    530    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
    531        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
    532        util_format_fits_8unorm(format_desc) &&
    533        type.width == 8 && type.norm == 1 && type.sign == 0 &&
    534        type.fixed == 0 && type.floating == 0) {
    535       LLVMValueRef packed, res, chans[4], rgba[4];
    536       LLVMTypeRef dst_vec_type, conv_vec_type;
    537       struct lp_type fetch_type, conv_type;
    538       struct lp_build_context bld_conv;
    539       unsigned j;
    540 
    541       fetch_type = lp_type_uint(type.width*4);
    542       conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
    543       dst_vec_type = lp_build_vec_type(gallivm, type);
    544       conv_vec_type = lp_build_vec_type(gallivm, conv_type);
    545       lp_build_context_init(&bld_conv, gallivm, conv_type);
    546 
    547       packed = lp_build_gather(gallivm, type.length/4,
    548                                format_desc->block.bits, fetch_type,
    549                                aligned, base_ptr, offset, TRUE);
    550 
    551       assert(format_desc->block.bits * type.length / 4 <=
    552              type.width * type.length);
    553 
    554       packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
    555 
    556       for (j = 0; j < format_desc->nr_channels; ++j) {
    557          unsigned mask = 0;
    558          unsigned sa = format_desc->channel[j].shift;
    559 
    560          mask = (1 << format_desc->channel[j].size) - 1;
    561 
    562          /* Extract bits from source */
    563          chans[j] = LLVMBuildLShr(builder, packed,
    564                                   lp_build_const_int_vec(gallivm, conv_type, sa),
    565                                   "");
    566 
    567          chans[j] = LLVMBuildAnd(builder, chans[j],
    568                                  lp_build_const_int_vec(gallivm, conv_type, mask),
    569                                  "");
    570 
    571          /* Scale bits */
    572          if (type.norm) {
    573             chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
    574                                      type.width, chans[j], conv_type);
    575          }
    576       }
    577       /*
    578        * This is a hacked lp_build_format_swizzle_soa() since we need a
    579        * normalized 1 but only 8 bits in a 32bit vector...
    580        */
    581       for (j = 0; j < 4; ++j) {
    582          enum pipe_swizzle swizzle = format_desc->swizzle[j];
    583          if (swizzle == PIPE_SWIZZLE_1) {
    584             rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
    585          } else {
    586             rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
    587          }
    588          if (j == 0) {
    589             res = rgba[j];
    590          } else {
    591             rgba[j] = LLVMBuildShl(builder, rgba[j],
    592                                    lp_build_const_int_vec(gallivm, conv_type,
    593                                                           j * type.width), "");
    594             res = LLVMBuildOr(builder, res, rgba[j], "");
    595          }
    596       }
    597       res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
    598 
    599       return res;
    600    }
    601 
    602    /*
    603     * Bit arithmetic
    604     */
    605 
    606    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
    607        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
    608         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
    609        format_desc->block.width == 1 &&
    610        format_desc->block.height == 1 &&
    611        /* XXX this shouldn't be needed */
    612        util_is_power_of_two(format_desc->block.bits) &&
    613        format_desc->block.bits <= 32 &&
    614        format_desc->is_bitmask &&
    615        !format_desc->is_mixed &&
    616        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
    617         format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
    618        !format_desc->channel[0].pure_integer) {
    619 
    620       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
    621       LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
    622       struct lp_type conv_type;
    623       unsigned k, num_conv_src, num_conv_dst;
    624 
    625       /*
    626        * Note this path is generally terrible for fetching multiple pixels.
    627        * We should make sure we cannot hit this code path for anything but
    628        * single pixels.
    629        */
    630 
    631       /*
    632        * Unpack a pixel at a time into a <4 x float> RGBA vector
    633        */
    634 
    635       for (k = 0; k < num_pixels; ++k) {
    636          LLVMValueRef packed;
    637 
    638          packed = lp_build_gather_elem(gallivm, num_pixels,
    639                                        format_desc->block.bits, 32, aligned,
    640                                        base_ptr, offset, k, FALSE);
    641 
    642          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
    643                                                   format_desc,
    644                                                   packed);
    645       }
    646 
    647       /*
    648        * Type conversion.
    649        *
    650        * TODO: We could avoid floating conversion for integer to
    651        * integer conversions.
    652        */
    653 
    654       if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
    655          debug_printf("%s: unpacking %s with floating point\n",
    656                       __FUNCTION__, format_desc->short_name);
    657       }
    658 
    659       conv_type = lp_float32_vec4_type();
    660       num_conv_src = num_pixels;
    661       num_conv_dst = 1;
    662 
    663       if (num_pixels % 8 == 0) {
    664          lp_build_concat_n(gallivm, lp_float32_vec4_type(),
    665                            tmps, num_pixels, tmps, num_pixels / 2);
    666          conv_type.length *= num_pixels / 4;
    667          num_conv_src = 4 * num_pixels / 8;
    668          if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
    669             /*
    670              * FIXME: The fast float->unorm path (which is basically
    671              * skipping the MIN/MAX which are extremely pointless in any
    672              * case) requires that there's 2 destinations...
    673              * In any case, we really should make sure we don't hit this
    674              * code with multiple pixels for unorm8 dst types, it's
    675              * completely hopeless even if we do hit the right conversion.
    676              */
    677             type.length /= num_pixels / 4;
    678             num_conv_dst = num_pixels / 4;
    679          }
    680       }
    681 
    682       lp_build_conv(gallivm, conv_type, type,
    683                     tmps, num_conv_src, res, num_conv_dst);
    684 
    685       if (num_pixels % 8 == 0 &&
    686           (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
    687          lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
    688       }
    689 
    690       return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
    691    }
    692 
    693    /* If all channels are of same type and we are not using half-floats */
    694    if (format_desc->is_array &&
    695        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
    696       assert(!format_desc->is_mixed);
    697       return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
    698    }
    699 
    700    /*
    701     * YUV / subsampled formats
    702     */
    703 
    704    if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
    705       struct lp_type tmp_type;
    706       LLVMValueRef tmp;
    707 
    708       memset(&tmp_type, 0, sizeof tmp_type);
    709       tmp_type.width = 8;
    710       tmp_type.length = num_pixels * 4;
    711       tmp_type.norm = TRUE;
    712 
    713       tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
    714                                                format_desc,
    715                                                num_pixels,
    716                                                base_ptr,
    717                                                offset,
    718                                                i, j);
    719 
    720       lp_build_conv(gallivm,
    721                     tmp_type, type,
    722                     &tmp, 1, &tmp, 1);
    723 
    724       return tmp;
    725    }
    726 
    727    /*
    728     * s3tc rgb formats
    729     */
    730 
    731    if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
    732       struct lp_type tmp_type;
    733       LLVMValueRef tmp;
    734 
    735       memset(&tmp_type, 0, sizeof tmp_type);
    736       tmp_type.width = 8;
    737       tmp_type.length = num_pixels * 4;
    738       tmp_type.norm = TRUE;
    739 
    740       tmp = lp_build_fetch_cached_texels(gallivm,
    741                                          format_desc,
    742                                          num_pixels,
    743                                          base_ptr,
    744                                          offset,
    745                                          i, j,
    746                                          cache);
    747 
    748       lp_build_conv(gallivm,
    749                     tmp_type, type,
    750                     &tmp, 1, &tmp, 1);
    751 
    752        return tmp;
    753    }
    754 
    755    /*
    756     * Fallback to util_format_description::fetch_rgba_8unorm().
    757     */
    758 
    759    if (format_desc->fetch_rgba_8unorm &&
    760        !type.floating && type.width == 8 && !type.sign && type.norm) {
    761       /*
    762        * Fallback to calling util_format_description::fetch_rgba_8unorm.
    763        *
    764        * This is definitely not the most efficient way of fetching pixels, as
     765        * we miss the opportunity to do vectorization, but it is
     766        * convenient for formats or scenarios for which there was no opportunity
    767        * or incentive to optimize.
    768        */
    769 
    770       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
    771       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
    772       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
    773       LLVMValueRef function;
    774       LLVMValueRef tmp_ptr;
    775       LLVMValueRef tmp;
    776       LLVMValueRef res;
    777       unsigned k;
    778 
    779       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
    780          debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
    781                       __FUNCTION__, format_desc->short_name);
    782       }
    783 
    784       /*
    785        * Declare and bind format_desc->fetch_rgba_8unorm().
    786        */
    787 
    788       {
    789          /*
    790           * Function to call looks like:
    791           *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
    792           */
    793          LLVMTypeRef ret_type;
    794          LLVMTypeRef arg_types[4];
    795          LLVMTypeRef function_type;
    796 
    797          ret_type = LLVMVoidTypeInContext(gallivm->context);
    798          arg_types[0] = pi8t;
    799          arg_types[1] = pi8t;
    800          arg_types[2] = i32t;
    801          arg_types[3] = i32t;
    802          function_type = LLVMFunctionType(ret_type, arg_types,
    803                                           ARRAY_SIZE(arg_types), 0);
    804 
    805          /* make const pointer for the C fetch_rgba_8unorm function */
    806          function = lp_build_const_int_pointer(gallivm,
    807             func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
    808 
    809          /* cast the callee pointer to the function's type */
    810          function = LLVMBuildBitCast(builder, function,
    811                                      LLVMPointerType(function_type, 0),
    812                                      "cast callee");
    813       }
    814 
    815       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
    816 
    817       res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
    818 
    819       /*
    820        * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
     821        * in the AoS vectors.
    822        */
    823 
    824       for (k = 0; k < num_pixels; ++k) {
    825          LLVMValueRef index = lp_build_const_int32(gallivm, k);
    826          LLVMValueRef args[4];
    827 
    828          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
    829          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
    830                                             base_ptr, offset, k);
    831 
    832          if (num_pixels == 1) {
    833             args[2] = i;
    834             args[3] = j;
    835          }
    836          else {
    837             args[2] = LLVMBuildExtractElement(builder, i, index, "");
    838             args[3] = LLVMBuildExtractElement(builder, j, index, "");
    839          }
    840 
    841          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
    842 
    843          tmp = LLVMBuildLoad(builder, tmp_ptr, "");
    844 
    845          if (num_pixels == 1) {
    846             res = tmp;
    847          }
    848          else {
    849             res = LLVMBuildInsertElement(builder, res, tmp, index, "");
    850          }
    851       }
    852 
    853       /* Bitcast from <n x i32> to <4n x i8> */
    854       res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
    855 
    856       return res;
    857    }
    858 
    859    /*
    860     * Fallback to util_format_description::fetch_rgba_float().
    861     */
    862 
    863    if (format_desc->fetch_rgba_float) {
    864       /*
    865        * Fallback to calling util_format_description::fetch_rgba_float.
    866        *
    867        * This is definitely not the most efficient way of fetching pixels, as
     868        * we miss the opportunity to do vectorization, but it is
     869        * convenient for formats or scenarios for which there was no opportunity
    870        * or incentive to optimize.
    871        */
    872 
    873       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
    874       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
    875       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
    876       LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
    877       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
    878       LLVMValueRef function;
    879       LLVMValueRef tmp_ptr;
    880       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
    881       LLVMValueRef res;
    882       unsigned k;
    883 
    884       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
    885          debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
    886                       __FUNCTION__, format_desc->short_name);
    887       }
    888 
    889       /*
    890        * Declare and bind format_desc->fetch_rgba_float().
    891        */
    892 
    893       {
    894          /*
    895           * Function to call looks like:
    896           *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
    897           */
    898          LLVMTypeRef ret_type;
    899          LLVMTypeRef arg_types[4];
    900 
    901          ret_type = LLVMVoidTypeInContext(gallivm->context);
    902          arg_types[0] = pf32t;
    903          arg_types[1] = pi8t;
    904          arg_types[2] = i32t;
    905          arg_types[3] = i32t;
    906 
    907          function = lp_build_const_func_pointer(gallivm,
    908                                                 func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
    909                                                 ret_type,
    910                                                 arg_types, ARRAY_SIZE(arg_types),
    911                                                 format_desc->short_name);
    912       }
    913 
    914       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
    915 
    916       /*
    917        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
     918        * in the AoS vectors.
    919        */
    920 
    921       for (k = 0; k < num_pixels; ++k) {
    922          LLVMValueRef args[4];
    923 
    924          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
    925          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
    926                                             base_ptr, offset, k);
    927 
    928          if (num_pixels == 1) {
    929             args[2] = i;
    930             args[3] = j;
    931          }
    932          else {
    933             LLVMValueRef index = lp_build_const_int32(gallivm, k);
    934             args[2] = LLVMBuildExtractElement(builder, i, index, "");
    935             args[3] = LLVMBuildExtractElement(builder, j, index, "");
    936          }
    937 
    938          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
    939 
    940          tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
    941       }
    942 
    943       lp_build_conv(gallivm,
    944                     lp_float32_vec4_type(),
    945                     type,
    946                     tmps, num_pixels, &res, 1);
    947 
    948       return res;
    949    }
    950 
    951    assert(!util_format_is_pure_integer(format_desc->format));
    952 
    953    assert(0);
    954    return lp_build_undef(gallivm, type);
    955 }
    956