Home | History | Annotate | Download | only in gallivm
      1 /**************************************************************************
      2  *
      3  * Copyright 2009 VMware, Inc.
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 /**
     29  * @file
     30  * Texture sampling -- SoA.
     31  *
     32  * @author Jose Fonseca <jfonseca (at) vmware.com>
     33  * @author Brian Paul <brianp (at) vmware.com>
     34  */
     35 
     36 #include "pipe/p_defines.h"
     37 #include "pipe/p_state.h"
     38 #include "pipe/p_shader_tokens.h"
     39 #include "util/u_debug.h"
     40 #include "util/u_dump.h"
     41 #include "util/u_memory.h"
     42 #include "util/u_math.h"
     43 #include "util/u_format.h"
     44 #include "util/u_cpu_detect.h"
     45 #include "util/format_rgb9e5.h"
     46 #include "lp_bld_debug.h"
     47 #include "lp_bld_type.h"
     48 #include "lp_bld_const.h"
     49 #include "lp_bld_conv.h"
     50 #include "lp_bld_arit.h"
     51 #include "lp_bld_bitarit.h"
     52 #include "lp_bld_logic.h"
     53 #include "lp_bld_printf.h"
     54 #include "lp_bld_swizzle.h"
     55 #include "lp_bld_flow.h"
     56 #include "lp_bld_gather.h"
     57 #include "lp_bld_format.h"
     58 #include "lp_bld_sample.h"
     59 #include "lp_bld_sample_aos.h"
     60 #include "lp_bld_struct.h"
     61 #include "lp_bld_quad.h"
     62 #include "lp_bld_pack.h"
     63 #include "lp_bld_intr.h"
     64 
     65 
     66 /**
     67  * Generate code to fetch a texel from a texture at int coords (x, y, z).
     68  * The computation depends on whether the texture is 1D, 2D or 3D.
     69  * The result, texel, will be float vectors:
     70  *   texel[0] = red values
     71  *   texel[1] = green values
     72  *   texel[2] = blue values
     73  *   texel[3] = alpha values
     74  */
     75 static void
     76 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
     77                           LLVMValueRef width,
     78                           LLVMValueRef height,
     79                           LLVMValueRef depth,
     80                           LLVMValueRef x,
     81                           LLVMValueRef y,
     82                           LLVMValueRef z,
     83                           LLVMValueRef y_stride,
     84                           LLVMValueRef z_stride,
     85                           LLVMValueRef data_ptr,
     86                           LLVMValueRef mipoffsets,
     87                           LLVMValueRef texel_out[4])
     88 {
     89    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
     90    const unsigned dims = bld->dims;
     91    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
     92    LLVMBuilderRef builder = bld->gallivm->builder;
     93    LLVMValueRef offset;
     94    LLVMValueRef i, j;
     95    LLVMValueRef use_border = NULL;
     96 
     97    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
     98    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
     99                                               static_state->min_img_filter,
    100                                               static_state->mag_img_filter)) {
    101       LLVMValueRef b1, b2;
    102       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
    103       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
    104       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
    105    }
    106 
    107    if (dims >= 2 &&
    108        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
    109                                               static_state->min_img_filter,
    110                                               static_state->mag_img_filter)) {
    111       LLVMValueRef b1, b2;
    112       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
    113       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
    114       if (use_border) {
    115          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
    116          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
    117       }
    118       else {
    119          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
    120       }
    121    }
    122 
    123    if (dims == 3 &&
    124        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
    125                                               static_state->min_img_filter,
    126                                               static_state->mag_img_filter)) {
    127       LLVMValueRef b1, b2;
    128       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
    129       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
    130       if (use_border) {
    131          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
    132          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
    133       }
    134       else {
    135          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
    136       }
    137    }
    138 
    139    /* convert x,y,z coords to linear offset from start of texture, in bytes */
    140    lp_build_sample_offset(&bld->int_coord_bld,
    141                           bld->format_desc,
    142                           x, y, z, y_stride, z_stride,
    143                           &offset, &i, &j);
    144    if (mipoffsets) {
    145       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
    146    }
    147 
    148    if (use_border) {
    149       /* If we can sample the border color, it means that texcoords may
    150        * lie outside the bounds of the texture image.  We need to do
    151        * something to prevent reading out of bounds and causing a segfault.
    152        *
    153        * Simply AND the texture coords with !use_border.  This will cause
    154        * coords which are out of bounds to become zero.  Zero's guaranteed
    155        * to be inside the texture image.
    156        */
    157       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
    158    }
    159 
    160    lp_build_fetch_rgba_soa(bld->gallivm,
    161                            bld->format_desc,
    162                            bld->texel_type, TRUE,
    163                            data_ptr, offset,
    164                            i, j,
    165                            bld->cache,
    166                            texel_out);
    167 
    168    /*
    169     * Note: if we find an app which frequently samples the texture border
    170     * we might want to implement a true conditional here to avoid sampling
    171     * the texture whenever possible (since that's quite a bit of code).
    172     * Ex:
    173     *   if (use_border) {
    174     *      texel = border_color;
    175     *   }
    176     *   else {
    177     *      texel = sample_texture(coord);
    178     *   }
    179     * As it is now, we always sample the texture, then selectively replace
    180     * the texel color results with the border color.
    181     */
    182 
    183    if (use_border) {
    184       /* select texel color or border color depending on use_border. */
    185       const struct util_format_description *format_desc = bld->format_desc;
    186       int chan;
    187       struct lp_type border_type = bld->texel_type;
    188       border_type.length = 4;
    189       /*
    190        * Only replace channels which are actually present. The others should
    191        * get optimized away eventually by sampler_view swizzle anyway but it's
    192        * easier too.
    193        */
    194       for (chan = 0; chan < 4; chan++) {
    195          unsigned chan_s;
    196          /* reverse-map channel... */
    197          for (chan_s = 0; chan_s < 4; chan_s++) {
    198             if (chan_s == format_desc->swizzle[chan]) {
    199                break;
    200             }
    201          }
    202          if (chan_s <= 3) {
    203             /* use the already clamped color */
    204             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
    205             LLVMValueRef border_chan;
    206 
    207             border_chan = lp_build_extract_broadcast(bld->gallivm,
    208                                                      border_type,
    209                                                      bld->texel_type,
    210                                                      bld->border_color_clamped,
    211                                                      idx);
    212             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
    213                                               border_chan, texel_out[chan]);
    214          }
    215       }
    216    }
    217 }
    218 
    219 
    220 /**
    221  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
    222  * (Note that with pot sizes could do this much more easily post-scale
    223  * with some bit arithmetic.)
    224  */
    225 static LLVMValueRef
    226 lp_build_coord_mirror(struct lp_build_sample_context *bld,
    227                       LLVMValueRef coord, boolean posOnly)
    228 {
    229    struct lp_build_context *coord_bld = &bld->coord_bld;
    230    LLVMValueRef fract;
    231    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
    232 
    233    /*
    234     * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
    235     * it all works out. (The result is in range [-1, 1.0], negative if
    236     * the coord is in the "odd" section, otherwise positive.)
    237     */
    238 
    239    coord = lp_build_mul(coord_bld, coord, half);
    240    fract = lp_build_round(coord_bld, coord);
    241    fract = lp_build_sub(coord_bld, coord, fract);
    242    coord = lp_build_add(coord_bld, fract, fract);
    243 
    244    if (posOnly) {
    245       /*
    246        * Theoretically it's not quite 100% accurate because the spec says
    247        * that ultimately a scaled coord of -x.0 should map to int coord
    248        * -x + 1 with mirroring, not -x (this does not matter for bilinear
    249        * filtering).
    250        */
    251       coord = lp_build_abs(coord_bld, coord);
    252       /* kill off NaNs */
    253       /* XXX: not safe without arch rounding, fract can be anything. */
    254       coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
    255                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
    256    }
    257 
    258    return coord;
    259 }
    260 
    261 
    262 /**
    263  * Helper to compute the first coord and the weight for
    264  * linear wrap repeat npot textures
    265  */
    266 void
    267 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
    268                                   LLVMValueRef coord_f,
    269                                   LLVMValueRef length_i,
    270                                   LLVMValueRef length_f,
    271                                   LLVMValueRef *coord0_i,
    272                                   LLVMValueRef *weight_f)
    273 {
    274    struct lp_build_context *coord_bld = &bld->coord_bld;
    275    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    276    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
    277    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
    278                                                 int_coord_bld->one);
    279    LLVMValueRef mask;
    280    /* wrap with normalized floats is just fract */
    281    coord_f = lp_build_fract(coord_bld, coord_f);
    282    /* mul by size and subtract 0.5 */
    283    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
    284    coord_f = lp_build_sub(coord_bld, coord_f, half);
    285    /*
    286     * we avoided the 0.5/length division before the repeat wrap,
    287     * now need to fix up edge cases with selects
    288     */
    289    /*
    290     * Note we do a float (unordered) compare so we can eliminate NaNs.
    291     * (Otherwise would need fract_safe above).
    292     */
    293    mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
    294                            PIPE_FUNC_LESS, coord_f, coord_bld->zero);
    295 
    296    /* convert to int, compute lerp weight */
    297    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
    298    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
    299 }
    300 
    301 
    302 /**
    303  * Build LLVM code for texture wrap mode for linear filtering.
    304  * \param x0_out  returns first integer texcoord
    305  * \param x1_out  returns second integer texcoord
    306  * \param weight_out  returns linear interpolation weight
    307  */
    308 static void
    309 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    310                             boolean is_gather,
    311                             LLVMValueRef coord,
    312                             LLVMValueRef length,
    313                             LLVMValueRef length_f,
    314                             LLVMValueRef offset,
    315                             boolean is_pot,
    316                             unsigned wrap_mode,
    317                             LLVMValueRef *x0_out,
    318                             LLVMValueRef *x1_out,
    319                             LLVMValueRef *weight_out)
    320 {
    321    struct lp_build_context *coord_bld = &bld->coord_bld;
    322    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    323    LLVMBuilderRef builder = bld->gallivm->builder;
    324    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
    325    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    326    LLVMValueRef coord0, coord1, weight;
    327 
    328    switch(wrap_mode) {
    329    case PIPE_TEX_WRAP_REPEAT:
    330       if (is_pot) {
    331          /* mul by size and subtract 0.5 */
    332          coord = lp_build_mul(coord_bld, coord, length_f);
    333          coord = lp_build_sub(coord_bld, coord, half);
    334          if (offset) {
    335             offset = lp_build_int_to_float(coord_bld, offset);
    336             coord = lp_build_add(coord_bld, coord, offset);
    337          }
    338          /* convert to int, compute lerp weight */
    339          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    340          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    341          /* repeat wrap */
    342          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
    343          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
    344       }
    345       else {
    346          LLVMValueRef mask;
    347          if (offset) {
    348             offset = lp_build_int_to_float(coord_bld, offset);
    349             offset = lp_build_div(coord_bld, offset, length_f);
    350             coord = lp_build_add(coord_bld, coord, offset);
    351          }
    352          lp_build_coord_repeat_npot_linear(bld, coord,
    353                                            length, length_f,
    354                                            &coord0, &weight);
    355          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
    356                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
    357          coord1 = LLVMBuildAnd(builder,
    358                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
    359                                mask, "");
    360       }
    361       break;
    362 
    363    case PIPE_TEX_WRAP_CLAMP:
    364       if (bld->static_sampler_state->normalized_coords) {
    365          /* scale coord to length */
    366          coord = lp_build_mul(coord_bld, coord, length_f);
    367       }
    368       if (offset) {
    369          offset = lp_build_int_to_float(coord_bld, offset);
    370          coord = lp_build_add(coord_bld, coord, offset);
    371       }
    372 
    373       /*
    374        * clamp to [0, length]
    375        *
    376        * Unlike some other wrap modes, this should be correct for gather
    377        * too. GL_CLAMP explicitly does this clamp on the coord prior to
    378        * actual wrapping (which is per sample).
    379        */
    380       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
    381 
    382       coord = lp_build_sub(coord_bld, coord, half);
    383 
    384       /* convert to int, compute lerp weight */
    385       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    386       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    387       break;
    388 
    389    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
    390       {
    391          struct lp_build_context abs_coord_bld = bld->coord_bld;
    392          abs_coord_bld.type.sign = FALSE;
    393 
    394          if (bld->static_sampler_state->normalized_coords) {
    395             /* mul by tex size */
    396             coord = lp_build_mul(coord_bld, coord, length_f);
    397          }
    398          if (offset) {
    399             offset = lp_build_int_to_float(coord_bld, offset);
    400             coord = lp_build_add(coord_bld, coord, offset);
    401          }
    402 
    403          /* clamp to length max */
    404          coord = lp_build_min_ext(coord_bld, coord, length_f,
    405                                   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
    406          if (!is_gather) {
    407             /* subtract 0.5 */
    408             coord = lp_build_sub(coord_bld, coord, half);
    409             /* clamp to [0, length - 0.5] */
    410             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
    411             /* convert to int, compute lerp weight */
    412             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
    413             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    414          } else {
    415             /*
    416              * The non-gather path will end up with coords 0, 1 if coord was
    417              * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
    418              * really matter what the second coord is). But for gather, we
    419              * really need to end up with coords 0, 0.
    420              */
    421             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
    422             coord0 = lp_build_sub(coord_bld, coord, half);
    423             coord1 = lp_build_add(coord_bld, coord, half);
    424             /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
    425             coord0 = lp_build_itrunc(coord_bld, coord0);
    426             coord1 = lp_build_itrunc(coord_bld, coord1);
    427             weight = coord_bld->undef;
    428          }
    429          /* coord1 = min(coord1, length-1) */
    430          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
    431          break;
    432       }
    433 
    434    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    435       if (bld->static_sampler_state->normalized_coords) {
    436          /* scale coord to length */
    437          coord = lp_build_mul(coord_bld, coord, length_f);
    438       }
    439       if (offset) {
    440          offset = lp_build_int_to_float(coord_bld, offset);
    441          coord = lp_build_add(coord_bld, coord, offset);
    442       }
    443       /*
    444        * We don't need any clamp. Technically, for very large (pos or neg)
    445        * (or infinite) values, clamp against [-length, length] would be
    446        * correct, but we don't need to guarantee any specific
    447        * result for such coords (the ifloor will be undefined, but for modes
    448        * requiring border all resulting coords are safe).
    449        */
    450       coord = lp_build_sub(coord_bld, coord, half);
    451       /* convert to int, compute lerp weight */
    452       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    453       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    454       break;
    455 
    456    case PIPE_TEX_WRAP_MIRROR_REPEAT:
    457       if (offset) {
    458          offset = lp_build_int_to_float(coord_bld, offset);
    459          offset = lp_build_div(coord_bld, offset, length_f);
    460          coord = lp_build_add(coord_bld, coord, offset);
    461       }
    462       if (!is_gather) {
    463          /* compute mirror function */
    464          coord = lp_build_coord_mirror(bld, coord, TRUE);
    465 
    466          /* scale coord to length */
    467          coord = lp_build_mul(coord_bld, coord, length_f);
    468          coord = lp_build_sub(coord_bld, coord, half);
    469 
    470          /* convert to int, compute lerp weight */
    471          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    472          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    473 
    474          /* coord0 = max(coord0, 0) */
    475          coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
    476          /* coord1 = min(coord1, length-1) */
    477          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
    478       } else {
    479          /*
    480           * This is pretty reasonable in the end,  all what the tests care
    481           * about is nasty edge cases (scaled coords x.5, so the individual
    482           * coords are actually integers, which is REALLY tricky to get right
    483           * due to this working differently both for negative numbers as well
    484           * as for even/odd cases). But with enough magic it's not too complex
    485           * after all.
    486           * Maybe should try a bit arithmetic one though for POT textures...
    487           */
    488          LLVMValueRef isNeg;
    489          /*
    490           * Wrapping just once still works, even though it means we can
    491           * get "wrong" sign due to performing mirror in the middle of the
    492           * two coords (because this can only happen very near the odd/even
    493           * edges, so both coords will actually end up as 0 or length - 1
    494           * in the end).
    495           * For GL4 gather with per-sample offsets we'd need to the mirroring
    496           * per coord too.
    497           */
    498          coord = lp_build_coord_mirror(bld, coord, FALSE);
    499          coord = lp_build_mul(coord_bld, coord, length_f);
    500 
    501          /*
    502           * NaNs should be safe here, we'll do away with them with
    503           * the ones' complement plus min.
    504           */
    505          coord0 = lp_build_sub(coord_bld, coord, half);
    506          coord0 = lp_build_ifloor(coord_bld, coord0);
    507          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    508          /* ones complement for neg numbers (mirror(negX) = X - 1)  */
    509          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
    510                               coord0, int_coord_bld->zero);
    511          coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
    512          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
    513                               coord1, int_coord_bld->zero);
    514          coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
    515          coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
    516          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
    517 
    518          weight = coord_bld->undef;
    519       }
    520       break;
    521 
    522    case PIPE_TEX_WRAP_MIRROR_CLAMP:
    523       if (bld->static_sampler_state->normalized_coords) {
    524          /* scale coord to length */
    525          coord = lp_build_mul(coord_bld, coord, length_f);
    526       }
    527       if (offset) {
    528          offset = lp_build_int_to_float(coord_bld, offset);
    529          coord = lp_build_add(coord_bld, coord, offset);
    530       }
    531       /*
    532        * XXX: probably not correct for gather, albeit I'm not
    533        * entirely sure as it's poorly specified. The wrapping looks
    534        * correct according to the spec which is against gl 1.2.1,
    535        * however negative values will be swapped - gl re-specified
    536        * wrapping with newer versions (no more pre-clamp except with
    537        * GL_CLAMP).
    538        */
    539       coord = lp_build_abs(coord_bld, coord);
    540 
    541       /* clamp to [0, length] */
    542       coord = lp_build_min_ext(coord_bld, coord, length_f,
    543                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
    544 
    545       coord = lp_build_sub(coord_bld, coord, half);
    546 
    547       /* convert to int, compute lerp weight */
    548       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    549       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    550       break;
    551 
    552    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
    553       {
    554          struct lp_build_context abs_coord_bld = bld->coord_bld;
    555          abs_coord_bld.type.sign = FALSE;
    556 
    557          if (bld->static_sampler_state->normalized_coords) {
    558             /* scale coord to length */
    559             coord = lp_build_mul(coord_bld, coord, length_f);
    560          }
    561          if (offset) {
    562             offset = lp_build_int_to_float(coord_bld, offset);
    563             coord = lp_build_add(coord_bld, coord, offset);
    564          }
    565          if (!is_gather) {
    566             coord = lp_build_abs(coord_bld, coord);
    567 
    568             /* clamp to length max */
    569             coord = lp_build_min_ext(coord_bld, coord, length_f,
    570                                      GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
    571             /* subtract 0.5 */
    572             coord = lp_build_sub(coord_bld, coord, half);
    573             /* clamp to [0, length - 0.5] */
    574             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
    575 
    576             /* convert to int, compute lerp weight */
    577             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
    578             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    579             /* coord1 = min(coord1, length-1) */
    580             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
    581          } else {
    582             /*
    583              * The non-gather path will swap coord0/1 if coord was negative,
    584              * which is ok for filtering since the filter weight matches
    585              * accordingly. Also, if coord is close to zero, coord0/1 will
    586              * be 0 and 1, instead of 0 and 0 (again ok due to filter
    587              * weight being 0.0). Both issues need to be fixed for gather.
    588              */
    589             LLVMValueRef isNeg;
    590 
    591             /*
    592              * Actually wanted to cheat here and use:
    593              * coord1 = lp_build_iround(coord_bld, coord);
    594              * but it's not good enough for some tests (even piglit
    595              * textureGather is set up in a way so the coords area always
    596              * .5, that is right at the crossover points).
    597              * So do ordinary sub/floor, then do ones' complement
    598              * for negative numbers.
    599              * (Note can't just do sub|add/abs/itrunc per coord neither -
    600              * because the spec demands that mirror(3.0) = 3 but
    601              * mirror(-3.0) = 2.)
    602              */
    603             coord = lp_build_sub(coord_bld, coord, half);
    604             coord0 = lp_build_ifloor(coord_bld, coord);
    605             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    606             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
    607                                  int_coord_bld->zero);
    608             coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
    609             coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
    610 
    611             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
    612                                  int_coord_bld->zero);
    613             coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
    614             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
    615 
    616             weight = coord_bld->undef;
    617          }
    618       }
    619       break;
    620 
    621    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
    622       {
    623          if (bld->static_sampler_state->normalized_coords) {
    624             /* scale coord to length */
    625             coord = lp_build_mul(coord_bld, coord, length_f);
    626          }
    627          if (offset) {
    628             offset = lp_build_int_to_float(coord_bld, offset);
    629             coord = lp_build_add(coord_bld, coord, offset);
    630          }
    631          /*
    632           * XXX: probably not correct for gather due to swapped
    633           * order if coord is negative (same rationale as for
    634           * MIRROR_CLAMP).
    635           */
    636          coord = lp_build_abs(coord_bld, coord);
    637 
    638          /*
    639           * We don't need any clamp. Technically, for very large
    640           * (or infinite) values, clamp against length would be
    641           * correct, but we don't need to guarantee any specific
    642           * result for such coords (the ifloor will be undefined, but
    643           * for modes requiring border all resulting coords are safe).
    644           */
    645          coord = lp_build_sub(coord_bld, coord, half);
    646 
    647          /* convert to int, compute lerp weight */
    648          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
    649          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
    650       }
    651       break;
    652 
    653    default:
    654       assert(0);
    655       coord0 = NULL;
    656       coord1 = NULL;
    657       weight = NULL;
    658    }
    659 
    660    *x0_out = coord0;
    661    *x1_out = coord1;
    662    *weight_out = weight;
    663 }
    664 
    665 
    666 /**
    667  * Build LLVM code for texture wrap mode for nearest filtering.
    668  * \param coord  the incoming texcoord (nominally in [0,1])
    669  * \param length  the texture size along one dimension, as int vector
    670  * \param length_f  the texture size along one dimension, as float vector
    671  * \param offset  texel offset along one dimension (as int vector)
    672  * \param is_pot  if TRUE, length is a power of two
    673  * \param wrap_mode  one of PIPE_TEX_WRAP_x
    674  */
    675 static LLVMValueRef
    676 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    677                              LLVMValueRef coord,
    678                              LLVMValueRef length,
    679                              LLVMValueRef length_f,
    680                              LLVMValueRef offset,
    681                              boolean is_pot,
    682                              unsigned wrap_mode)
    683 {
    684    struct lp_build_context *coord_bld = &bld->coord_bld;
    685    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    686    LLVMBuilderRef builder = bld->gallivm->builder;
    687    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    688    LLVMValueRef icoord;
    689 
    690    switch(wrap_mode) {
    691    case PIPE_TEX_WRAP_REPEAT:
    692       if (is_pot) {
    693          coord = lp_build_mul(coord_bld, coord, length_f);
    694          icoord = lp_build_ifloor(coord_bld, coord);
    695          if (offset) {
    696             icoord = lp_build_add(int_coord_bld, icoord, offset);
    697          }
    698          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
    699       }
    700       else {
    701           if (offset) {
    702              offset = lp_build_int_to_float(coord_bld, offset);
    703              offset = lp_build_div(coord_bld, offset, length_f);
    704              coord = lp_build_add(coord_bld, coord, offset);
    705           }
    706           /* take fraction, unnormalize */
    707           coord = lp_build_fract_safe(coord_bld, coord);
    708           coord = lp_build_mul(coord_bld, coord, length_f);
    709           icoord = lp_build_itrunc(coord_bld, coord);
    710       }
    711       break;
    712 
    713    case PIPE_TEX_WRAP_CLAMP:
    714    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
    715       if (bld->static_sampler_state->normalized_coords) {
    716          /* scale coord to length */
    717          coord = lp_build_mul(coord_bld, coord, length_f);
    718       }
    719 
    720       if (offset) {
    721          offset = lp_build_int_to_float(coord_bld, offset);
    722          coord = lp_build_add(coord_bld, coord, offset);
    723       }
    724       /* floor */
    725       /* use itrunc instead since we clamp to 0 anyway */
    726       icoord = lp_build_itrunc(coord_bld, coord);
    727 
    728       /* clamp to [0, length - 1]. */
    729       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
    730                               length_minus_one);
    731       break;
    732 
    733    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    734       if (bld->static_sampler_state->normalized_coords) {
    735          /* scale coord to length */
    736          coord = lp_build_mul(coord_bld, coord, length_f);
    737       }
    738       /* no clamp necessary, border masking will handle this */
    739       icoord = lp_build_ifloor(coord_bld, coord);
    740       if (offset) {
    741          icoord = lp_build_add(int_coord_bld, icoord, offset);
    742       }
    743       break;
    744 
    745    case PIPE_TEX_WRAP_MIRROR_REPEAT:
    746       if (offset) {
    747          offset = lp_build_int_to_float(coord_bld, offset);
    748          offset = lp_build_div(coord_bld, offset, length_f);
    749          coord = lp_build_add(coord_bld, coord, offset);
    750       }
    751       /* compute mirror function */
    752       coord = lp_build_coord_mirror(bld, coord, TRUE);
    753 
    754       /* scale coord to length */
    755       assert(bld->static_sampler_state->normalized_coords);
    756       coord = lp_build_mul(coord_bld, coord, length_f);
    757 
    758       /* itrunc == ifloor here */
    759       icoord = lp_build_itrunc(coord_bld, coord);
    760 
    761       /* clamp to [0, length - 1] */
    762       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
    763       break;
    764 
    765    case PIPE_TEX_WRAP_MIRROR_CLAMP:
    766    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
    767       if (bld->static_sampler_state->normalized_coords) {
    768          /* scale coord to length */
    769          coord = lp_build_mul(coord_bld, coord, length_f);
    770       }
    771       if (offset) {
    772          offset = lp_build_int_to_float(coord_bld, offset);
    773          coord = lp_build_add(coord_bld, coord, offset);
    774       }
    775       coord = lp_build_abs(coord_bld, coord);
    776 
    777       /* itrunc == ifloor here */
    778       icoord = lp_build_itrunc(coord_bld, coord);
    779       /*
    780        * Use unsigned min due to possible undef values (NaNs, overflow)
    781        */
    782       {
    783          struct lp_build_context abs_coord_bld = *int_coord_bld;
    784          abs_coord_bld.type.sign = FALSE;
    785          /* clamp to [0, length - 1] */
    786          icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
    787       }
    788       break;
    789 
    790    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
    791       if (bld->static_sampler_state->normalized_coords) {
    792          /* scale coord to length */
    793          coord = lp_build_mul(coord_bld, coord, length_f);
    794       }
    795       if (offset) {
    796          offset = lp_build_int_to_float(coord_bld, offset);
    797          coord = lp_build_add(coord_bld, coord, offset);
    798       }
    799       coord = lp_build_abs(coord_bld, coord);
    800 
    801       /* itrunc == ifloor here */
    802       icoord = lp_build_itrunc(coord_bld, coord);
    803       break;
    804 
    805    default:
    806       assert(0);
    807       icoord = NULL;
    808    }
    809 
    810    return icoord;
    811 }
    812 
    813 
    814 /**
    815  * Do shadow test/comparison.
    816  * \param p shadow ref value
    817  * \param texel  the texel to compare against
    818  */
    819 static LLVMValueRef
    820 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
    821                             LLVMValueRef p,
    822                             LLVMValueRef texel)
    823 {
    824    struct lp_build_context *texel_bld = &bld->texel_bld;
    825    LLVMValueRef res;
    826 
    827    if (0) {
    828       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
    829       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
    830    }
    831 
    832    /* result = (p FUNC texel) ? 1 : 0 */
    833    /*
    834     * honor d3d10 floating point rules here, which state that comparisons
    835     * are ordered except NOT_EQUAL which is unordered.
    836     */
    837    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
    838       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
    839                                  p, texel);
    840    }
    841    else {
    842       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
    843                          p, texel);
    844    }
    845    return res;
    846 }
    847 
    848 
    849 /**
    850  * Generate code to sample a mipmap level with nearest filtering.
    851  * If sampling a cube texture, r = cube face in [0,5].
    852  */
    853 static void
    854 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    855                               LLVMValueRef size,
    856                               LLVMValueRef row_stride_vec,
    857                               LLVMValueRef img_stride_vec,
    858                               LLVMValueRef data_ptr,
    859                               LLVMValueRef mipoffsets,
    860                               const LLVMValueRef *coords,
    861                               const LLVMValueRef *offsets,
    862                               LLVMValueRef colors_out[4])
    863 {
    864    const unsigned dims = bld->dims;
    865    LLVMValueRef width_vec;
    866    LLVMValueRef height_vec;
    867    LLVMValueRef depth_vec;
    868    LLVMValueRef flt_size;
    869    LLVMValueRef flt_width_vec;
    870    LLVMValueRef flt_height_vec;
    871    LLVMValueRef flt_depth_vec;
    872    LLVMValueRef x, y = NULL, z = NULL;
    873 
    874    lp_build_extract_image_sizes(bld,
    875                                 &bld->int_size_bld,
    876                                 bld->int_coord_type,
    877                                 size,
    878                                 &width_vec, &height_vec, &depth_vec);
    879 
    880    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
    881 
    882    lp_build_extract_image_sizes(bld,
    883                                 &bld->float_size_bld,
    884                                 bld->coord_type,
    885                                 flt_size,
    886                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
    887 
    888    /*
    889     * Compute integer texcoords.
    890     */
    891    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
    892                                     flt_width_vec, offsets[0],
    893                                     bld->static_texture_state->pot_width,
    894                                     bld->static_sampler_state->wrap_s);
    895    lp_build_name(x, "tex.x.wrapped");
    896 
    897    if (dims >= 2) {
    898       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
    899                                        flt_height_vec, offsets[1],
    900                                        bld->static_texture_state->pot_height,
    901                                        bld->static_sampler_state->wrap_t);
    902       lp_build_name(y, "tex.y.wrapped");
    903 
    904       if (dims == 3) {
    905          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
    906                                           flt_depth_vec, offsets[2],
    907                                           bld->static_texture_state->pot_depth,
    908                                           bld->static_sampler_state->wrap_r);
    909          lp_build_name(z, "tex.z.wrapped");
    910       }
    911    }
    912    if (has_layer_coord(bld->static_texture_state->target)) {
    913       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
    914          /* add cube layer to face */
    915          z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
    916       }
    917       else {
    918          z = coords[2];
    919       }
    920       lp_build_name(z, "tex.z.layer");
    921    }
    922 
    923    /*
    924     * Get texture colors.
    925     */
    926    lp_build_sample_texel_soa(bld,
    927                              width_vec, height_vec, depth_vec,
    928                              x, y, z,
    929                              row_stride_vec, img_stride_vec,
    930                              data_ptr, mipoffsets, colors_out);
    931 
    932    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
    933       LLVMValueRef cmpval;
    934       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
    935       /* this is really just a AND 1.0, cmpval but llvm is clever enough */
    936       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
    937                                       bld->texel_bld.one, bld->texel_bld.zero);
    938       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
    939    }
    940 
    941 }
    942 
    943 
    944 /**
    945  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
    946  */
    947 static LLVMValueRef
    948 lp_build_masklerp(struct lp_build_context *bld,
    949                  LLVMValueRef weight,
    950                  LLVMValueRef mask0,
    951                  LLVMValueRef mask1)
    952 {
    953    struct gallivm_state *gallivm = bld->gallivm;
    954    LLVMBuilderRef builder = gallivm->builder;
    955    LLVMValueRef weight2;
    956 
    957    weight2 = lp_build_sub(bld, bld->one, weight);
    958    weight = LLVMBuildBitCast(builder, weight,
    959                               lp_build_int_vec_type(gallivm, bld->type), "");
    960    weight2 = LLVMBuildBitCast(builder, weight2,
    961                               lp_build_int_vec_type(gallivm, bld->type), "");
    962    weight = LLVMBuildAnd(builder, weight, mask1, "");
    963    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
    964    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
    965    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
    966    return lp_build_add(bld, weight, weight2);
    967 }
    968 
    969 /**
    970  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
    971  */
    972 static LLVMValueRef
    973 lp_build_masklerp2d(struct lp_build_context *bld,
    974                     LLVMValueRef weight0,
    975                     LLVMValueRef weight1,
    976                     LLVMValueRef mask00,
    977                     LLVMValueRef mask01,
    978                     LLVMValueRef mask10,
    979                     LLVMValueRef mask11)
    980 {
    981    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
    982    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
    983    return lp_build_lerp(bld, weight1, val0, val1, 0);
    984 }
    985 
/*
 * Accurate cube corner handling is a somewhat excessive amount of code
 * for something OpenGL merely recommends but does not require.
 */
    990 #define ACCURATE_CUBE_CORNERS 1
    991 
    992 /**
    993  * Generate code to sample a mipmap level with linear filtering.
    994  * If sampling a cube texture, r = cube face in [0,5].
    995  * If linear_mask is present, only pixels having their mask set
    996  * will receive linear filtering, the rest will use nearest.
    997  */
    998 static void
    999 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
   1000                              boolean is_gather,
   1001                              LLVMValueRef size,
   1002                              LLVMValueRef linear_mask,
   1003                              LLVMValueRef row_stride_vec,
   1004                              LLVMValueRef img_stride_vec,
   1005                              LLVMValueRef data_ptr,
   1006                              LLVMValueRef mipoffsets,
   1007                              const LLVMValueRef *coords,
   1008                              const LLVMValueRef *offsets,
   1009                              LLVMValueRef colors_out[4])
   1010 {
   1011    LLVMBuilderRef builder = bld->gallivm->builder;
   1012    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
   1013    struct lp_build_context *coord_bld = &bld->coord_bld;
   1014    struct lp_build_context *texel_bld = &bld->texel_bld;
   1015    const unsigned dims = bld->dims;
   1016    LLVMValueRef width_vec;
   1017    LLVMValueRef height_vec;
   1018    LLVMValueRef depth_vec;
   1019    LLVMValueRef flt_size;
   1020    LLVMValueRef flt_width_vec;
   1021    LLVMValueRef flt_height_vec;
   1022    LLVMValueRef flt_depth_vec;
   1023    LLVMValueRef fall_off[4], have_corners;
   1024    LLVMValueRef z1 = NULL;
   1025    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
   1026    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
   1027    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
   1028    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
   1029    LLVMValueRef xs[4], ys[4], zs[4];
   1030    LLVMValueRef neighbors[2][2][4];
   1031    int chan, texel_index;
   1032    boolean seamless_cube_filter, accurate_cube_corners;
   1033    unsigned chan_swiz = bld->static_texture_state->swizzle_r;
   1034 
   1035    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
   1036                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
   1037                           bld->static_sampler_state->seamless_cube_map;
   1038 
   1039    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
   1040 
   1041    lp_build_extract_image_sizes(bld,
   1042                                 &bld->int_size_bld,
   1043                                 bld->int_coord_type,
   1044                                 size,
   1045                                 &width_vec, &height_vec, &depth_vec);
   1046 
   1047    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
   1048 
   1049    lp_build_extract_image_sizes(bld,
   1050                                 &bld->float_size_bld,
   1051                                 bld->coord_type,
   1052                                 flt_size,
   1053                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
   1054 
   1055    /*
   1056     * Compute integer texcoords.
   1057     */
   1058 
   1059    if (!seamless_cube_filter) {
   1060       lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
   1061                                   flt_width_vec, offsets[0],
   1062                                   bld->static_texture_state->pot_width,
   1063                                   bld->static_sampler_state->wrap_s,
   1064                                   &x00, &x01, &s_fpart);
   1065       lp_build_name(x00, "tex.x0.wrapped");
   1066       lp_build_name(x01, "tex.x1.wrapped");
   1067       x10 = x00;
   1068       x11 = x01;
   1069 
   1070       if (dims >= 2) {
   1071          lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
   1072                                      flt_height_vec, offsets[1],
   1073                                      bld->static_texture_state->pot_height,
   1074                                      bld->static_sampler_state->wrap_t,
   1075                                      &y00, &y10, &t_fpart);
   1076          lp_build_name(y00, "tex.y0.wrapped");
   1077          lp_build_name(y10, "tex.y1.wrapped");
   1078          y01 = y00;
   1079          y11 = y10;
   1080 
   1081          if (dims == 3) {
   1082             lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
   1083                                         flt_depth_vec, offsets[2],
   1084                                         bld->static_texture_state->pot_depth,
   1085                                         bld->static_sampler_state->wrap_r,
   1086                                         &z00, &z1, &r_fpart);
   1087             z01 = z10 = z11 = z00;
   1088             lp_build_name(z00, "tex.z0.wrapped");
   1089             lp_build_name(z1, "tex.z1.wrapped");
   1090          }
   1091       }
   1092       if (has_layer_coord(bld->static_texture_state->target)) {
   1093          if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
   1094             /* add cube layer to face */
   1095             z00 = z01 = z10 = z11 = z1 =
   1096                lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
   1097          }
   1098          else {
   1099             z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
   1100          }
   1101          lp_build_name(z00, "tex.z0.layer");
   1102          lp_build_name(z1, "tex.z1.layer");
   1103       }
   1104    }
   1105    else {
   1106       struct lp_build_if_state edge_if;
   1107       LLVMTypeRef int1t;
   1108       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
   1109       LLVMValueRef coord0, coord1, have_edge, have_corner;
   1110       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
   1111       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
   1112       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
   1113       LLVMValueRef face = coords[2];
   1114       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
   1115       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
   1116       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
   1117       height_vec = width_vec;
   1118       flt_height_vec = flt_width_vec;
   1119 
   1120       /* XXX the overflow logic is actually sort of duplicated with trilinear,
   1121        * since an overflow in one mip should also have a corresponding overflow
   1122        * in another.
   1123        */
   1124       /* should always have normalized coords, and offsets are undefined */
   1125       assert(bld->static_sampler_state->normalized_coords);
   1126       /*
   1127        * The coords should all be between [0,1] however we can have NaNs,
   1128        * which will wreak havoc. In particular the y1_clamped value below
   1129        * can be -INT_MAX (on x86) and be propagated right through (probably
   1130        * other values might be bogus in the end too).
   1131        * So kill off the NaNs here.
   1132        */
   1133       coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
   1134                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   1135       coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
   1136       /* instead of clamp, build mask if overflowed */
   1137       coord0 = lp_build_sub(coord_bld, coord0, half);
   1138       /* convert to int, compute lerp weight */
   1139       /* not ideal with AVX (and no AVX2) */
   1140       lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
   1141       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
   1142       coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
   1143                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   1144       coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
   1145       coord1 = lp_build_sub(coord_bld, coord1, half);
   1146       lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
   1147       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
   1148 
   1149       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
   1150       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
   1151       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
   1152       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
   1153 
   1154       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
   1155       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
   1156       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
   1157       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
   1158 
   1159       /* needed for accurate corner filtering branch later, rely on 0 init */
   1160       int1t = LLVMInt1TypeInContext(bld->gallivm->context);
   1161       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
   1162 
   1163       for (texel_index = 0; texel_index < 4; texel_index++) {
   1164          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
   1165          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
   1166          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
   1167       }
   1168 
   1169       lp_build_if(&edge_if, bld->gallivm, have_edge);
   1170 
   1171       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
   1172       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
   1173       LLVMBuildStore(builder, have_corner, have_corners);
   1174 
   1175       /*
   1176        * Need to feed clamped values here for cheap corner handling,
   1177        * but only for y coord (as when falling off both edges we only
   1178        * fall off the x one) - this should be sufficient.
   1179        */
   1180       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
   1181       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
   1182 
   1183       /*
   1184        * Get all possible new coords.
   1185        */
   1186       lp_build_cube_new_coords(ivec_bld, face,
   1187                                x0, x1, y0_clamped, y1_clamped,
   1188                                length_minus_one,
   1189                                new_faces, new_xcoords, new_ycoords);
   1190 
   1191       /* handle fall off x-, x+ direction */
   1192       /* determine new coords, face (not both fall_off vars can be true at same time) */
   1193       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
   1194       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
   1195       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
   1196       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
   1197       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
   1198       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
   1199       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
   1200       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
   1201 
   1202       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
   1203       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
   1204 
   1205       /* handle fall off y-, y+ direction */
   1206       /*
   1207        * Cheap corner logic: just hack up things so a texel doesn't fall
   1208        * off both sides (which means filter weights will be wrong but we'll only
   1209        * use valid texels in the filter).
   1210        * This means however (y) coords must additionally be clamped (see above).
   1211        * This corner handling should be fully OpenGL (but not d3d10) compliant.
   1212        */
   1213       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
   1214       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
   1215       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
   1216       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
   1217 
   1218       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
   1219       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
   1220       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
   1221       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
   1222       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
   1223       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
   1224       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
   1225       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
   1226 
   1227       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
   1228       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
   1229       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
   1230       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
   1231 
   1232       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
   1233          /* now can add cube layer to face (per sample) */
   1234          z00 = lp_build_add(ivec_bld, z00, coords[3]);
   1235          z01 = lp_build_add(ivec_bld, z01, coords[3]);
   1236          z10 = lp_build_add(ivec_bld, z10, coords[3]);
   1237          z11 = lp_build_add(ivec_bld, z11, coords[3]);
   1238       }
   1239 
   1240       LLVMBuildStore(builder, x00, xs[0]);
   1241       LLVMBuildStore(builder, x01, xs[1]);
   1242       LLVMBuildStore(builder, x10, xs[2]);
   1243       LLVMBuildStore(builder, x11, xs[3]);
   1244       LLVMBuildStore(builder, y00, ys[0]);
   1245       LLVMBuildStore(builder, y01, ys[1]);
   1246       LLVMBuildStore(builder, y10, ys[2]);
   1247       LLVMBuildStore(builder, y11, ys[3]);
   1248       LLVMBuildStore(builder, z00, zs[0]);
   1249       LLVMBuildStore(builder, z01, zs[1]);
   1250       LLVMBuildStore(builder, z10, zs[2]);
   1251       LLVMBuildStore(builder, z11, zs[3]);
   1252 
   1253       lp_build_else(&edge_if);
   1254 
   1255       LLVMBuildStore(builder, x0, xs[0]);
   1256       LLVMBuildStore(builder, x1, xs[1]);
   1257       LLVMBuildStore(builder, x0, xs[2]);
   1258       LLVMBuildStore(builder, x1, xs[3]);
   1259       LLVMBuildStore(builder, y0, ys[0]);
   1260       LLVMBuildStore(builder, y0, ys[1]);
   1261       LLVMBuildStore(builder, y1, ys[2]);
   1262       LLVMBuildStore(builder, y1, ys[3]);
   1263       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
   1264          LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
   1265          LLVMBuildStore(builder, cube_layer, zs[0]);
   1266          LLVMBuildStore(builder, cube_layer, zs[1]);
   1267          LLVMBuildStore(builder, cube_layer, zs[2]);
   1268          LLVMBuildStore(builder, cube_layer, zs[3]);
   1269       }
   1270       else {
   1271          LLVMBuildStore(builder, face, zs[0]);
   1272          LLVMBuildStore(builder, face, zs[1]);
   1273          LLVMBuildStore(builder, face, zs[2]);
   1274          LLVMBuildStore(builder, face, zs[3]);
   1275       }
   1276 
   1277       lp_build_endif(&edge_if);
   1278 
   1279       x00 = LLVMBuildLoad(builder, xs[0], "");
   1280       x01 = LLVMBuildLoad(builder, xs[1], "");
   1281       x10 = LLVMBuildLoad(builder, xs[2], "");
   1282       x11 = LLVMBuildLoad(builder, xs[3], "");
   1283       y00 = LLVMBuildLoad(builder, ys[0], "");
   1284       y01 = LLVMBuildLoad(builder, ys[1], "");
   1285       y10 = LLVMBuildLoad(builder, ys[2], "");
   1286       y11 = LLVMBuildLoad(builder, ys[3], "");
   1287       z00 = LLVMBuildLoad(builder, zs[0], "");
   1288       z01 = LLVMBuildLoad(builder, zs[1], "");
   1289       z10 = LLVMBuildLoad(builder, zs[2], "");
   1290       z11 = LLVMBuildLoad(builder, zs[3], "");
   1291    }
   1292 
   1293    if (linear_mask) {
   1294       /*
   1295        * Whack filter weights into place. Whatever texel had more weight is
   1296        * the one which should have been selected by nearest filtering hence
   1297        * just use 100% weight for it.
   1298        */
   1299       struct lp_build_context *c_bld = &bld->coord_bld;
   1300       LLVMValueRef w1_mask, w1_weight;
   1301       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
   1302 
   1303       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
   1304       /* this select is really just a "and" */
   1305       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
   1306       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
   1307       if (dims >= 2) {
   1308          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
   1309          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
   1310          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
   1311          if (dims == 3) {
   1312             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
   1313             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
   1314             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
   1315          }
   1316       }
   1317    }
   1318 
   1319    /*
   1320     * Get texture colors.
   1321     */
   1322    /* get x0/x1 texels */
   1323    lp_build_sample_texel_soa(bld,
   1324                              width_vec, height_vec, depth_vec,
   1325                              x00, y00, z00,
   1326                              row_stride_vec, img_stride_vec,
   1327                              data_ptr, mipoffsets, neighbors[0][0]);
   1328    lp_build_sample_texel_soa(bld,
   1329                              width_vec, height_vec, depth_vec,
   1330                              x01, y01, z01,
   1331                              row_stride_vec, img_stride_vec,
   1332                              data_ptr, mipoffsets, neighbors[0][1]);
   1333 
   1334    if (dims == 1) {
   1335       assert(!is_gather);
   1336       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
   1337          /* Interpolate two samples from 1D image to produce one color */
   1338          for (chan = 0; chan < 4; chan++) {
   1339             colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
   1340                                              neighbors[0][0][chan],
   1341                                              neighbors[0][1][chan],
   1342                                              0);
   1343          }
   1344       }
   1345       else {
   1346          LLVMValueRef cmpval0, cmpval1;
   1347          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
   1348          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
   1349          /* simplified lerp, AND mask with weight and add */
   1350          colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
   1351                                            cmpval0, cmpval1);
   1352          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
   1353       }
   1354    }
   1355    else {
   1356       /* 2D/3D texture */
   1357       struct lp_build_if_state corner_if;
   1358       LLVMValueRef colors0[4], colorss[4];
   1359 
   1360       /* get x0/x1 texels at y1 */
   1361       lp_build_sample_texel_soa(bld,
   1362                                 width_vec, height_vec, depth_vec,
   1363                                 x10, y10, z10,
   1364                                 row_stride_vec, img_stride_vec,
   1365                                 data_ptr, mipoffsets, neighbors[1][0]);
   1366       lp_build_sample_texel_soa(bld,
   1367                                 width_vec, height_vec, depth_vec,
   1368                                 x11, y11, z11,
   1369                                 row_stride_vec, img_stride_vec,
   1370                                 data_ptr, mipoffsets, neighbors[1][1]);
   1371 
   1372       /*
   1373        * To avoid having to duplicate linear_mask / fetch code use
   1374        * another branch (with corner condition though edge would work
   1375        * as well) here.
   1376        */
   1377       if (accurate_cube_corners) {
   1378          LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
   1379          LLVMValueRef have_corner, one_third;
   1380 
   1381          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
   1382          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
   1383          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
   1384          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
   1385 
   1386          have_corner = LLVMBuildLoad(builder, have_corners, "");
   1387 
   1388          lp_build_if(&corner_if, bld->gallivm, have_corner);
   1389 
   1390          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
   1391                                         1.0f/3.0f);
   1392 
   1393          /* find corner */
   1394          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
   1395          c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
   1396          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
   1397          c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
   1398          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
   1399          c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
   1400          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
   1401          c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
   1402 
   1403          if (!is_gather) {
   1404             /*
   1405              * we can't use standard 2d lerp as we need per-element weight
   1406              * in case of corners, so just calculate bilinear result as
   1407              * w00*s00 + w01*s01 + w10*s10 + w11*s11.
   1408              * (This is actually less work than using 2d lerp, 7 vs. 9
   1409              * instructions, however calculating the weights needs another 6,
   1410              * so actually probably not slower than 2d lerp only for 4 channels
   1411              * as weights only need to be calculated once - of course fixing
   1412              * the weights has additional cost.)
   1413              */
   1414             LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
   1415             wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
   1416             wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
   1417             w00 = lp_build_mul(coord_bld, wx0, wy0);
   1418             w01 = lp_build_mul(coord_bld, s_fpart, wy0);
   1419             w10 = lp_build_mul(coord_bld, wx0, t_fpart);
   1420             w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
   1421 
   1422             /* find corner weight */
   1423             c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
   1424             c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
   1425             c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
   1426             c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
   1427 
   1428             /*
   1429              * add 1/3 of the corner weight to the weight of the 3 other
   1430              * samples and null out corner weight.
   1431              */
   1432             c_weight = lp_build_mul(coord_bld, c_weight, one_third);
   1433             w00 = lp_build_add(coord_bld, w00, c_weight);
   1434             w00 = lp_build_andnot(coord_bld, w00, c00f);
   1435             w01 = lp_build_add(coord_bld, w01, c_weight);
   1436             w01 = lp_build_andnot(coord_bld, w01, c01f);
   1437             w10 = lp_build_add(coord_bld, w10, c_weight);
   1438             w10 = lp_build_andnot(coord_bld, w10, c10f);
   1439             w11 = lp_build_add(coord_bld, w11, c_weight);
   1440             w11 = lp_build_andnot(coord_bld, w11, c11f);
   1441 
   1442             if (bld->static_sampler_state->compare_mode ==
   1443                 PIPE_TEX_COMPARE_NONE) {
   1444                for (chan = 0; chan < 4; chan++) {
   1445                   colors0[chan] = lp_build_mul(coord_bld, w00,
   1446                                                neighbors[0][0][chan]);
   1447                   tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
   1448                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
   1449                   tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
   1450                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
   1451                   tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
   1452                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
   1453                }
   1454             }
   1455             else {
   1456                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
   1457                cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
   1458                                                       neighbors[0][0][0]);
   1459                cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
   1460                                                       neighbors[0][1][0]);
   1461                cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
   1462                                                       neighbors[1][0][0]);
   1463                cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
   1464                                                       neighbors[1][1][0]);
   1465                /*
   1466                 * inputs to interpolation are just masks so just add
   1467                 * masked weights together
   1468                 */
   1469                cmpval00 = LLVMBuildBitCast(builder, cmpval00,
   1470                                            coord_bld->vec_type, "");
   1471                cmpval01 = LLVMBuildBitCast(builder, cmpval01,
   1472                                            coord_bld->vec_type, "");
   1473                cmpval10 = LLVMBuildBitCast(builder, cmpval10,
   1474                                            coord_bld->vec_type, "");
   1475                cmpval11 = LLVMBuildBitCast(builder, cmpval11,
   1476                                            coord_bld->vec_type, "");
   1477                colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
   1478                tmp = lp_build_and(coord_bld, w01, cmpval01);
   1479                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
   1480                tmp = lp_build_and(coord_bld, w10, cmpval10);
   1481                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
   1482                tmp = lp_build_and(coord_bld, w11, cmpval11);
   1483                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
   1484                colors0[1] = colors0[2] = colors0[3] = colors0[0];
   1485             }
   1486          }
   1487          else {
   1488             /*
   1489              * We don't have any weights to adjust, so instead calculate
   1490              * the fourth texel as simply the average of the other 3.
   1491              * (This would work for non-gather too, however we'd have
   1492              * a boatload more of the select stuff due to there being
   1493              * 4 times as many colors as weights.)
   1494              */
   1495             LLVMValueRef col00, col01, col10, col11;
   1496             LLVMValueRef colc, colc0, colc1;
   1497             col10 = lp_build_swizzle_soa_channel(texel_bld,
   1498                                                  neighbors[1][0], chan_swiz);
   1499             col11 = lp_build_swizzle_soa_channel(texel_bld,
   1500                                                  neighbors[1][1], chan_swiz);
   1501             col01 = lp_build_swizzle_soa_channel(texel_bld,
   1502                                                  neighbors[0][1], chan_swiz);
   1503             col00 = lp_build_swizzle_soa_channel(texel_bld,
   1504                                                  neighbors[0][0], chan_swiz);
   1505 
   1506             /*
   1507              * The spec says for comparison filtering, the comparison
   1508              * must happen before synthesizing the new value.
   1509              * This means all gathered values are always 0 or 1,
   1510              * except for the non-existing texel, which can be 0,1/3,2/3,1...
   1511              * Seems like we'd be allowed to just return 0 or 1 too, so we
   1512              * could simplify and pass down the compare mask values to the
   1513              * end (using int arithmetic/compare on the mask values to
   1514              * construct the fourth texel) and only there convert to floats
   1515              * but it's probably not worth it (it might be easier for the cpu
   1516              * but not for the code)...
   1517              */
   1518             if (bld->static_sampler_state->compare_mode !=
   1519                 PIPE_TEX_COMPARE_NONE) {
   1520                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
   1521                cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
   1522                cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
   1523                cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
   1524                cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
   1525                col00 = lp_build_select(texel_bld, cmpval00,
   1526                                        texel_bld->one, texel_bld->zero);
   1527                col01 = lp_build_select(texel_bld, cmpval01,
   1528                                        texel_bld->one, texel_bld->zero);
   1529                col10 = lp_build_select(texel_bld, cmpval10,
   1530                                        texel_bld->one, texel_bld->zero);
   1531                col11 = lp_build_select(texel_bld, cmpval11,
   1532                                        texel_bld->one, texel_bld->zero);
   1533             }
   1534 
   1535             /*
   1536              * Null out corner color.
   1537              */
   1538             col00 = lp_build_andnot(coord_bld, col00, c00f);
   1539             col01 = lp_build_andnot(coord_bld, col01, c01f);
   1540             col10 = lp_build_andnot(coord_bld, col10, c10f);
   1541             col11 = lp_build_andnot(coord_bld, col11, c11f);
   1542 
   1543             /*
   1544              * New corner texel color is all colors added / 3.
   1545              */
   1546             colc0 = lp_build_add(coord_bld, col00, col01);
   1547             colc1 = lp_build_add(coord_bld, col10, col11);
   1548             colc = lp_build_add(coord_bld, colc0, colc1);
   1549             colc = lp_build_mul(coord_bld, one_third, colc);
   1550 
   1551             /*
   1552              * Replace the corner texel color with the new value.
   1553              */
   1554             col00 = lp_build_select(coord_bld, c00, colc, col00);
   1555             col01 = lp_build_select(coord_bld, c01, colc, col01);
   1556             col10 = lp_build_select(coord_bld, c10, colc, col10);
   1557             col11 = lp_build_select(coord_bld, c11, colc, col11);
   1558 
   1559             colors0[0] = col10;
   1560             colors0[1] = col11;
   1561             colors0[2] = col01;
   1562             colors0[3] = col00;
   1563          }
   1564 
   1565          LLVMBuildStore(builder, colors0[0], colorss[0]);
   1566          LLVMBuildStore(builder, colors0[1], colorss[1]);
   1567          LLVMBuildStore(builder, colors0[2], colorss[2]);
   1568          LLVMBuildStore(builder, colors0[3], colorss[3]);
   1569 
   1570          lp_build_else(&corner_if);
   1571       }
   1572 
   1573       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
   1574          if (is_gather) {
   1575             /*
   1576              * Just assign the red channel (no component selection yet).
   1577              * This is a bit hackish, we usually do the swizzle at the
   1578              * end of sampling (much less values to swizzle), but this
   1579              * obviously cannot work when using gather.
   1580              */
   1581             colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
   1582                                                       neighbors[1][0],
   1583                                                       chan_swiz);
   1584             colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
   1585                                                       neighbors[1][1],
   1586                                                       chan_swiz);
   1587             colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
   1588                                                       neighbors[0][1],
   1589                                                       chan_swiz);
   1590             colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
   1591                                                       neighbors[0][0],
   1592                                                       chan_swiz);
   1593          }
   1594          else {
   1595             /* Bilinear interpolate the four samples from the 2D image / 3D slice */
   1596             for (chan = 0; chan < 4; chan++) {
   1597                colors0[chan] = lp_build_lerp_2d(texel_bld,
   1598                                                 s_fpart, t_fpart,
   1599                                                 neighbors[0][0][chan],
   1600                                                 neighbors[0][1][chan],
   1601                                                 neighbors[1][0][chan],
   1602                                                 neighbors[1][1][chan],
   1603                                                 0);
   1604             }
   1605          }
   1606       }
   1607       else {
   1608          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
   1609          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
   1610          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
   1611          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
   1612          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
   1613 
   1614          if (is_gather) {
   1615             /* more hacks for swizzling, should be X, ONE or ZERO... */
   1616             colors0[0] = lp_build_select(texel_bld, cmpval10,
   1617                                          texel_bld->one, texel_bld->zero);
   1618             colors0[1] = lp_build_select(texel_bld, cmpval11,
   1619                                          texel_bld->one, texel_bld->zero);
   1620             colors0[2] = lp_build_select(texel_bld, cmpval01,
   1621                                          texel_bld->one, texel_bld->zero);
   1622             colors0[3] = lp_build_select(texel_bld, cmpval00,
   1623                                          texel_bld->one, texel_bld->zero);
   1624          }
   1625          else {
   1626             colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
   1627                                              cmpval00, cmpval01, cmpval10, cmpval11);
   1628             colors0[1] = colors0[2] = colors0[3] = colors0[0];
   1629          }
   1630       }
   1631 
   1632       if (accurate_cube_corners) {
   1633          LLVMBuildStore(builder, colors0[0], colorss[0]);
   1634          LLVMBuildStore(builder, colors0[1], colorss[1]);
   1635          LLVMBuildStore(builder, colors0[2], colorss[2]);
   1636          LLVMBuildStore(builder, colors0[3], colorss[3]);
   1637 
   1638          lp_build_endif(&corner_if);
   1639 
   1640          colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
   1641          colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
   1642          colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
   1643          colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
   1644       }
   1645 
   1646       if (dims == 3) {
   1647          LLVMValueRef neighbors1[2][2][4];
   1648          LLVMValueRef colors1[4];
   1649 
   1650          assert(!is_gather);
   1651 
   1652          /* get x0/x1/y0/y1 texels at z1 */
   1653          lp_build_sample_texel_soa(bld,
   1654                                    width_vec, height_vec, depth_vec,
   1655                                    x00, y00, z1,
   1656                                    row_stride_vec, img_stride_vec,
   1657                                    data_ptr, mipoffsets, neighbors1[0][0]);
   1658          lp_build_sample_texel_soa(bld,
   1659                                    width_vec, height_vec, depth_vec,
   1660                                    x01, y01, z1,
   1661                                    row_stride_vec, img_stride_vec,
   1662                                    data_ptr, mipoffsets, neighbors1[0][1]);
   1663          lp_build_sample_texel_soa(bld,
   1664                                    width_vec, height_vec, depth_vec,
   1665                                    x10, y10, z1,
   1666                                    row_stride_vec, img_stride_vec,
   1667                                    data_ptr, mipoffsets, neighbors1[1][0]);
   1668          lp_build_sample_texel_soa(bld,
   1669                                    width_vec, height_vec, depth_vec,
   1670                                    x11, y11, z1,
   1671                                    row_stride_vec, img_stride_vec,
   1672                                    data_ptr, mipoffsets, neighbors1[1][1]);
   1673 
   1674          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
   1675             /* Bilinear interpolate the four samples from the second Z slice */
   1676             for (chan = 0; chan < 4; chan++) {
   1677                colors1[chan] = lp_build_lerp_2d(texel_bld,
   1678                                                 s_fpart, t_fpart,
   1679                                                 neighbors1[0][0][chan],
   1680                                                 neighbors1[0][1][chan],
   1681                                                 neighbors1[1][0][chan],
   1682                                                 neighbors1[1][1][chan],
   1683                                                 0);
   1684             }
   1685             /* Linearly interpolate the two samples from the two 3D slices */
   1686             for (chan = 0; chan < 4; chan++) {
   1687                colors_out[chan] = lp_build_lerp(texel_bld,
   1688                                                 r_fpart,
   1689                                                 colors0[chan], colors1[chan],
   1690                                                 0);
   1691             }
   1692          }
   1693          else {
   1694             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
   1695             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
   1696             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
   1697             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
   1698             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
   1699             colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
   1700                                              cmpval00, cmpval01, cmpval10, cmpval11);
   1701             /* Linearly interpolate the two samples from the two 3D slices */
   1702             colors_out[0] = lp_build_lerp(texel_bld,
   1703                                           r_fpart,
   1704                                           colors0[0], colors1[0],
   1705                                           0);
   1706             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
   1707          }
   1708       }
   1709       else {
   1710          /* 2D tex */
   1711          for (chan = 0; chan < 4; chan++) {
   1712             colors_out[chan] = colors0[chan];
   1713          }
   1714       }
   1715    }
   1716    if (is_gather) {
   1717       /*
   1718        * For gather, we can't do our usual channel swizzling done later,
   1719        * so do it here. It only really matters for 0/1 swizzles in case
   1720        * of comparison filtering, since in this case the results would be
   1721        * wrong, without comparison it should all work out alright but it
   1722        * can't hurt to do that here, since it will instantly drop all
   1723        * calculations above, though it's a rather stupid idea to do
   1724        * gather on a channel which will always return 0 or 1 in any case...
   1725        */
   1726       if (chan_swiz == PIPE_SWIZZLE_1) {
   1727          for (chan = 0; chan < 4; chan++) {
   1728             colors_out[chan] = texel_bld->one;
   1729          }
   1730       } else if (chan_swiz == PIPE_SWIZZLE_0) {
   1731          for (chan = 0; chan < 4; chan++) {
   1732             colors_out[chan] = texel_bld->zero;
   1733          }
   1734       }
   1735    }
   1736 }
   1737 
   1738 
   1739 /**
   1740  * Sample the texture/mipmap using given image filter and mip filter.
   1741  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
   1742  * from (vectors or scalars).
   1743  * If we're using nearest miplevel sampling the '1' values will be null/unused.
         *
         * The first level (ilevel0) is always sampled and its colors are stored
         * to colors_out.  Only when mip_filter is PIPE_TEX_MIPFILTER_LINEAR is a
         * runtime branch emitted which samples the second level (ilevel1), lerps
         * both results with lod_fpart and stores the blended colors to colors_out
         * again, overwriting the first store.
         *
         * @param bld         sampling build context; this function reads num_mips,
         *                    num_lods, base_ptr, coord_type and the lodf/lodi/texel
         *                    build contexts from it
         * @param img_filter  PIPE_TEX_FILTER_NEAREST selects the nearest helper,
         *                    anything else takes the linear helper (asserted to be
         *                    PIPE_TEX_FILTER_LINEAR for the first level)
         * @param mip_filter  the second level is only sampled for
         *                    PIPE_TEX_MIPFILTER_LINEAR
         * @param is_gather   forwarded to lp_build_sample_image_linear for the
         *                    first level only; the second level always passes FALSE
         * @param coords      forwarded unchanged to the image sampling helpers
         * @param offsets     forwarded unchanged to the image sampling helpers
         * @param ilevel0     first mipmap level
         * @param ilevel1     second mipmap level (only used inside the lerp branch)
         * @param lod_fpart   weight for the lerp between the two levels; clamped
         *                    to >= 0 inside the lerp branch
         * @param colors_out  four destinations written with LLVMBuildStore, i.e.
         *                    these are store targets, not the color values
         *                    themselves
   1744  */
   1745 static void
   1746 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
   1747                        unsigned img_filter,
   1748                        unsigned mip_filter,
   1749                        boolean is_gather,
   1750                        const LLVMValueRef *coords,
   1751                        const LLVMValueRef *offsets,
   1752                        LLVMValueRef ilevel0,
   1753                        LLVMValueRef ilevel1,
   1754                        LLVMValueRef lod_fpart,
   1755                        LLVMValueRef *colors_out)
   1756 {
   1757    LLVMBuilderRef builder = bld->gallivm->builder;
           /* The '0' variables describe level ilevel0, the '1' variables ilevel1. */
   1758    LLVMValueRef size0 = NULL;
   1759    LLVMValueRef size1 = NULL;
   1760    LLVMValueRef row_stride0_vec = NULL;
   1761    LLVMValueRef row_stride1_vec = NULL;
   1762    LLVMValueRef img_stride0_vec = NULL;
   1763    LLVMValueRef img_stride1_vec = NULL;
   1764    LLVMValueRef data_ptr0 = NULL;
   1765    LLVMValueRef data_ptr1 = NULL;
           /* mipoff0/mipoff1 stay NULL on the num_mips == 1 path. */
   1766    LLVMValueRef mipoff0 = NULL;
   1767    LLVMValueRef mipoff1 = NULL;
   1768    LLVMValueRef colors0[4], colors1[4];
   1769    unsigned chan;
   1770 
   1771    /* sample the first mipmap level */
   1772    lp_build_mipmap_level_sizes(bld, ilevel0,
   1773                                &size0,
   1774                                &row_stride0_vec, &img_stride0_vec);
           /*
            * With a single mip the level's data pointer is fetched directly;
            * otherwise the base pointer is used and the per-level offsets are
            * handed to the sampling helper separately.
            */
   1775    if (bld->num_mips == 1) {
   1776       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   1777    }
   1778    else {
   1779       /* This path should work for num_lods 1 too but slightly less efficient */
   1780       data_ptr0 = bld->base_ptr;
   1781       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   1782    }
   1783    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
   1784       lp_build_sample_image_nearest(bld, size0,
   1785                                     row_stride0_vec, img_stride0_vec,
   1786                                     data_ptr0, mipoff0, coords, offsets,
   1787                                     colors0);
   1788    }
   1789    else {
   1790       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
              /*
               * NOTE(review): the NULL after size0 is presumably the optional
               * linear_mask of lp_build_sample_image_linear - confirm against
               * its signature.
               */
   1791       lp_build_sample_image_linear(bld, is_gather, size0, NULL,
   1792                                    row_stride0_vec, img_stride0_vec,
   1793                                    data_ptr0, mipoff0, coords, offsets,
   1794                                    colors0);
   1795    }
   1796 
   1797    /* Store the first level's colors in the output variables */
           /*
            * This store happens before the conditional below, so the outputs
            * already hold the first level's result when the lerp branch is not
            * taken (or not emitted at all).
            */
   1798    for (chan = 0; chan < 4; chan++) {
   1799        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   1800    }
   1801 
   1802    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
   1803       struct lp_build_if_state if_ctx;
   1804       LLVMValueRef need_lerp;
   1805 
   1806       /* need_lerp = lod_fpart > 0 */
   1807       if (bld->num_lods == 1) {
                 /*
                  * Single lod: one direct compare.  LLVMRealUGT is the
                  * unordered greater-than predicate.
                  */
   1808          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
   1809                                    lod_fpart, bld->lodf_bld.zero,
   1810                                    "need_lerp");
   1811       }
   1812       else {
   1813          /*
   1814           * We'll do mip filtering if any of the quads (or individual
   1815           * pixel in case of per-pixel lod) need it.
   1816           * It might be better to split the vectors here and only fetch/filter
   1817           * quads which need it (if there's one lod per quad).
   1818           */
   1819          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
   1820                                       PIPE_FUNC_GREATER,
   1821                                       lod_fpart, bld->lodf_bld.zero);
                 /* Collapse the per-lod compare mask over num_lods entries. */
   1822          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
   1823          lp_build_name(need_lerp, "need_lerp");
   1824       }
   1825 
              /* Everything up to lp_build_endif is emitted inside the branch. */
   1826       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
   1827       {
   1828          /*
   1829           * We unfortunately need to clamp lod_fpart here since we can get
   1830           * negative values which would screw up filtering if not all
   1831           * lod_fpart values have same sign.
   1832           */
   1833          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
   1834                                   bld->lodf_bld.zero);
   1835          /* sample the second mipmap level */
   1836          lp_build_mipmap_level_sizes(bld, ilevel1,
   1837                                      &size1,
   1838                                      &row_stride1_vec, &img_stride1_vec);
                 /* Same data pointer / mip offset selection as for the first level. */
   1839          if (bld->num_mips == 1) {
   1840             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
   1841          }
   1842          else {
   1843             data_ptr1 = bld->base_ptr;
   1844             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
   1845          }
   1846          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
   1847             lp_build_sample_image_nearest(bld, size1,
   1848                                           row_stride1_vec, img_stride1_vec,
   1849                                           data_ptr1, mipoff1, coords, offsets,
   1850                                           colors1);
   1851          }
   1852          else {
                    /*
                     * Unlike the first level, is_gather is not forwarded here:
                     * the second level is always sampled with FALSE.
                     */
   1853             lp_build_sample_image_linear(bld, FALSE, size1, NULL,
   1854                                          row_stride1_vec, img_stride1_vec,
   1855                                          data_ptr1, mipoff1, coords, offsets,
   1856                                          colors1);
   1857          }
   1858 
   1859          /* interpolate samples from the two mipmap levels */
   1860 
                 /*
                  * When there are fewer lods than coord vector elements, convert
                  * lod_fpart from the lodf type to the texel type so it can be
                  * used as the lerp weight below.
                  */
   1861          if (bld->num_lods != bld->coord_type.length)
   1862             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
   1863                                                               bld->lodf_bld.type,
   1864                                                               bld->texel_bld.type,
   1865                                                               lod_fpart);
   1866 
                 /* Blend both levels and overwrite the earlier first-level store. */
   1867          for (chan = 0; chan < 4; chan++) {
   1868             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
   1869                                           colors0[chan], colors1[chan],
   1870                                           0);
   1871             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   1872          }
   1873       }
   1874       lp_build_endif(&if_ctx);
   1875    }
   1876 }
   1877 
   1878 
   1879 /**
   1880  * Sample the texture/mipmap using given mip filter, and using
   1881  * both nearest and linear filtering at the same time depending
   1882  * on linear_mask.
   1883  * lod can be per quad but linear_mask is always per pixel.
   1884  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
   1885  * from (vectors or scalars).
   1886  * If we're using nearest miplevel sampling the '1' values will be null/unused.
   1887  */
   1888 static void
   1889 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
   1890                             LLVMValueRef linear_mask,
   1891                             unsigned mip_filter,
   1892                             const LLVMValueRef *coords,
   1893                             const LLVMValueRef *offsets,
   1894                             LLVMValueRef ilevel0,
   1895                             LLVMValueRef ilevel1,
   1896                             LLVMValueRef lod_fpart,
   1897                             LLVMValueRef lod_positive,
   1898                             LLVMValueRef *colors_out)
   1899 {
   1900    LLVMBuilderRef builder = bld->gallivm->builder;
   1901    LLVMValueRef size0 = NULL;
   1902    LLVMValueRef size1 = NULL;
   1903    LLVMValueRef row_stride0_vec = NULL;
   1904    LLVMValueRef row_stride1_vec = NULL;
   1905    LLVMValueRef img_stride0_vec = NULL;
   1906    LLVMValueRef img_stride1_vec = NULL;
   1907    LLVMValueRef data_ptr0 = NULL;
   1908    LLVMValueRef data_ptr1 = NULL;
   1909    LLVMValueRef mipoff0 = NULL;
   1910    LLVMValueRef mipoff1 = NULL;
   1911    LLVMValueRef colors0[4], colors1[4];
   1912    unsigned chan;
   1913 
   1914    /* sample the first mipmap level */
   1915    lp_build_mipmap_level_sizes(bld, ilevel0,
   1916                                &size0,
   1917                                &row_stride0_vec, &img_stride0_vec);
   1918    if (bld->num_mips == 1) {
   1919       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   1920    }
   1921    else {
   1922       /* This path should work for num_lods 1 too but slightly less efficient */
   1923       data_ptr0 = bld->base_ptr;
   1924       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   1925    }
   1926 
   1927    lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
   1928                                 row_stride0_vec, img_stride0_vec,
   1929                                 data_ptr0, mipoff0, coords, offsets,
   1930                                 colors0);
   1931 
   1932    /* Store the first level's colors in the output variables */
   1933    for (chan = 0; chan < 4; chan++) {
   1934        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   1935    }
   1936 
   1937    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
   1938       struct lp_build_if_state if_ctx;
   1939       LLVMValueRef need_lerp;
   1940 
   1941       /*
   1942        * We'll do mip filtering if any of the quads (or individual
   1943        * pixel in case of per-pixel lod) need it.
   1944        * Note using lod_positive here not lod_fpart since it may be the same
   1945        * condition as that used in the outer "if" in the caller hence llvm
   1946        * should be able to merge the branches in this case.
   1947        */
   1948       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
   1949       lp_build_name(need_lerp, "need_lerp");
   1950 
   1951       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
   1952       {
   1953          /*
   1954           * We unfortunately need to clamp lod_fpart here since we can get
   1955           * negative values which would screw up filtering if not all
   1956           * lod_fpart values have same sign.
   1957           */
   1958          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
   1959                                   bld->lodf_bld.zero);
   1960          /* sample the second mipmap level */
   1961          lp_build_mipmap_level_sizes(bld, ilevel1,
   1962                                      &size1,
   1963                                      &row_stride1_vec, &img_stride1_vec);
   1964          if (bld->num_mips == 1) {
   1965             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
   1966          }
   1967          else {
   1968             data_ptr1 = bld->base_ptr;
   1969             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
   1970          }
   1971 
   1972          lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
   1973                                       row_stride1_vec, img_stride1_vec,
   1974                                       data_ptr1, mipoff1, coords, offsets,
   1975                                       colors1);
   1976 
   1977          /* interpolate samples from the two mipmap levels */
   1978 
   1979          if (bld->num_lods != bld->coord_type.length)
   1980             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
   1981                                                               bld->lodf_bld.type,
   1982                                                               bld->texel_bld.type,
   1983                                                               lod_fpart);
   1984 
   1985          for (chan = 0; chan < 4; chan++) {
   1986             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
   1987                                           colors0[chan], colors1[chan],
   1988                                           0);
   1989             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   1990          }
   1991       }
   1992       lp_build_endif(&if_ctx);
   1993    }
   1994 }
   1995 
   1996 
   1997 /**
   1998  * Build (per-coord) layer value.
   1999  * Either clamp layer to valid values or fill in optional out_of_bounds
   2000  * value and just return value unclamped.
   2001  */
   2002 static LLVMValueRef
   2003 lp_build_layer_coord(struct lp_build_sample_context *bld,
   2004                      unsigned texture_unit,
   2005                      boolean is_cube_array,
   2006                      LLVMValueRef layer,
   2007                      LLVMValueRef *out_of_bounds)
   2008 {
   2009    LLVMValueRef num_layers;
   2010    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   2011 
   2012    num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
   2013                                           bld->context_ptr, texture_unit);
   2014 
   2015    if (out_of_bounds) {
   2016       LLVMValueRef out1, out;
   2017       assert(!is_cube_array);
   2018       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
   2019       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
   2020       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
   2021       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
   2022       return layer;
   2023    }
   2024    else {
   2025       LLVMValueRef maxlayer;
   2026       LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
   2027                                        bld->int_bld.one;
   2028       maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
   2029       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
   2030       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
   2031    }
   2032 }
   2033 
   2034 
   2035 /**
   2036  * Calculate cube face, lod, mip levels.
   2037  */
   2038 static void
   2039 lp_build_sample_common(struct lp_build_sample_context *bld,
   2040                        boolean is_lodq,
   2041                        unsigned texture_index,
   2042                        unsigned sampler_index,
   2043                        LLVMValueRef *coords,
   2044                        const struct lp_derivatives *derivs, /* optional */
   2045                        LLVMValueRef lod_bias, /* optional */
   2046                        LLVMValueRef explicit_lod, /* optional */
   2047                        LLVMValueRef *lod_pos_or_zero,
   2048                        LLVMValueRef *lod,
   2049                        LLVMValueRef *lod_fpart,
   2050                        LLVMValueRef *ilevel0,
   2051                        LLVMValueRef *ilevel1)
   2052 {
   2053    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   2054    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   2055    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   2056    const unsigned target = bld->static_texture_state->target;
   2057    LLVMValueRef first_level, cube_rho = NULL;
   2058    LLVMValueRef lod_ipart = NULL;
   2059    struct lp_derivatives cube_derivs;
   2060 
   2061    /*
   2062    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
   2063           mip_filter, min_filter, mag_filter);
   2064    */
   2065 
   2066    /*
   2067     * Choose cube face, recompute texcoords for the chosen face and
   2068     * compute rho here too (as it requires transform of derivatives).
   2069     */
   2070    if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
   2071       boolean need_derivs;
   2072       need_derivs = ((min_filter != mag_filter ||
   2073                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
   2074                       !bld->static_sampler_state->min_max_lod_equal &&
   2075                       !explicit_lod);
   2076       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
   2077       derivs = &cube_derivs;
   2078       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
   2079          /* calculate cube layer coord now */
   2080          LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
   2081          LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
   2082          layer = lp_build_mul(&bld->int_coord_bld, layer, six);
   2083          coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
   2084          /* because of seamless filtering can't add it to face (coords[2]) here. */
   2085       }
   2086    }
   2087    else if (target == PIPE_TEXTURE_1D_ARRAY ||
   2088             target == PIPE_TEXTURE_2D_ARRAY) {
   2089       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
   2090       coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
   2091    }
   2092 
   2093    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
   2094       /*
   2095        * Clamp p coords to [0,1] for fixed function depth texture format here.
   2096        * Technically this is not entirely correct for unorm depth as the ref value
   2097        * should be converted to the depth format (quantization!) and comparison
   2098        * then done in texture format. This would actually help performance (since
   2099        * only need to do it once and could save the per-sample conversion of texels
   2100        * to floats instead), but it would need more messy code (would need to push
   2101        * at least some bits down to actual fetch so conversion could be skipped,
   2102        * and would have ugly interaction with border color, would need to convert
   2103        * border color to that format too or do some other tricks to make it work).
   2104        */
   2105       const struct util_format_description *format_desc = bld->format_desc;
   2106       unsigned chan_type;
   2107       /* not entirely sure we couldn't end up with non-valid swizzle here */
   2108       chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
   2109                      format_desc->channel[format_desc->swizzle[0]].type :
   2110                      UTIL_FORMAT_TYPE_FLOAT;
   2111       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
   2112          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
   2113                                     bld->coord_bld.zero, bld->coord_bld.one);
   2114       }
   2115    }
   2116 
   2117    /*
   2118     * Compute the level of detail (float).
   2119     */
   2120    if (min_filter != mag_filter ||
   2121        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
   2122       /* Need to compute lod either to choose mipmap levels or to
   2123        * distinguish between minification/magnification with one mipmap level.
   2124        */
   2125       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
   2126                             coords[0], coords[1], coords[2], cube_rho,
   2127                             derivs, lod_bias, explicit_lod,
   2128                             mip_filter, lod,
   2129                             &lod_ipart, lod_fpart, lod_pos_or_zero);
   2130       if (is_lodq) {
   2131          LLVMValueRef last_level;
   2132          last_level = bld->dynamic_state->last_level(bld->dynamic_state,
   2133                                                      bld->gallivm,
   2134                                                      bld->context_ptr,
   2135                                                      texture_index);
   2136          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
   2137                                                        bld->gallivm,
   2138                                                        bld->context_ptr,
   2139                                                        texture_index);
   2140          last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
   2141          last_level = lp_build_int_to_float(&bld->float_bld, last_level);
   2142          last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
   2143 
   2144          switch (mip_filter) {
   2145          case PIPE_TEX_MIPFILTER_NONE:
   2146             *lod_fpart = bld->lodf_bld.zero;
   2147             break;
   2148          case PIPE_TEX_MIPFILTER_NEAREST:
   2149              *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
   2150              /* fallthrough */
   2151          case PIPE_TEX_MIPFILTER_LINEAR:
   2152             *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
   2153                                         bld->lodf_bld.zero, last_level);
   2154             break;
   2155          }
   2156          return;
   2157       }
   2158 
   2159    } else {
   2160       lod_ipart = bld->lodi_bld.zero;
   2161       *lod_pos_or_zero = bld->lodi_bld.zero;
   2162    }
   2163 
   2164    if (bld->num_lods != bld->num_mips) {
   2165       /* only makes sense if there's just a single mip level */
   2166       assert(bld->num_mips == 1);
   2167       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
   2168    }
   2169 
   2170    /*
   2171     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
   2172     */
   2173    switch (mip_filter) {
   2174    default:
   2175       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
   2176       /* fall-through */
   2177    case PIPE_TEX_MIPFILTER_NONE:
   2178       /* always use mip level 0 */
   2179       first_level = bld->dynamic_state->first_level(bld->dynamic_state,
   2180                                                     bld->gallivm, bld->context_ptr,
   2181                                                     texture_index);
   2182       first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
   2183       *ilevel0 = first_level;
   2184       break;
   2185    case PIPE_TEX_MIPFILTER_NEAREST:
   2186       assert(lod_ipart);
   2187       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
   2188       break;
   2189    case PIPE_TEX_MIPFILTER_LINEAR:
   2190       assert(lod_ipart);
   2191       assert(*lod_fpart);
   2192       lp_build_linear_mip_levels(bld, texture_index,
   2193                                  lod_ipart, lod_fpart,
   2194                                  ilevel0, ilevel1);
   2195       break;
   2196    }
   2197 }
   2198 
   2199 static void
   2200 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
   2201                             unsigned sampler_unit)
   2202 {
   2203    struct gallivm_state *gallivm = bld->gallivm;
   2204    LLVMBuilderRef builder = gallivm->builder;
   2205    LLVMValueRef border_color_ptr =
   2206       bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
   2207                                        bld->context_ptr, sampler_unit);
   2208    LLVMValueRef border_color;
   2209    const struct util_format_description *format_desc = bld->format_desc;
   2210    struct lp_type vec4_type = bld->texel_type;
   2211    struct lp_build_context vec4_bld;
   2212    LLVMValueRef min_clamp = NULL;
   2213    LLVMValueRef max_clamp = NULL;
   2214 
   2215    /*
   2216     * For normalized format need to clamp border color (technically
   2217     * probably should also quantize the data). Really sucks doing this
   2218     * here but can't avoid at least for now since this is part of
   2219     * sampler state and texture format is part of sampler_view state.
   2220     * GL expects also expects clamping for uint/sint formats too so
   2221     * do that as well (d3d10 can't end up here with uint/sint since it
   2222     * only supports them with ld).
   2223     */
   2224    vec4_type.length = 4;
   2225    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
   2226 
   2227    /*
   2228     * Vectorized clamping of border color. Loading is a bit of a hack since
   2229     * we just cast the pointer to float array to pointer to vec4
   2230     * (int or float).
   2231     */
   2232    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
   2233                                              lp_build_const_int32(gallivm, 0));
   2234    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
   2235                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
   2236    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
   2237    /* we don't have aligned type in the dynamic state unfortunately */
   2238    LLVMSetAlignment(border_color, 4);
   2239 
   2240    /*
   2241     * Instead of having some incredibly complex logic which will try to figure out
   2242     * clamping necessary for each channel, simply use the first channel, and treat
   2243     * mixed signed/unsigned normalized formats specially.
   2244     * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
   2245     * good reason.)
   2246     */
   2247    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
   2248       int chan;
   2249       /* d/s needs special handling because both present means just sampling depth */
   2250       if (util_format_is_depth_and_stencil(format_desc->format)) {
   2251          chan = format_desc->swizzle[0];
   2252       }
   2253       else {
   2254          chan = util_format_get_first_non_void_channel(format_desc->format);
   2255       }
   2256       if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
   2257          unsigned chan_type = format_desc->channel[chan].type;
   2258          unsigned chan_norm = format_desc->channel[chan].normalized;
   2259          unsigned chan_pure = format_desc->channel[chan].pure_integer;
   2260          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
   2261             if (chan_norm) {
   2262                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
   2263                max_clamp = vec4_bld.one;
   2264             }
   2265             else if (chan_pure) {
   2266                /*
   2267                 * Border color was stored as int, hence need min/max clamp
   2268                 * only if chan has less than 32 bits..
   2269                 */
   2270                unsigned chan_size = format_desc->channel[chan].size;
   2271                if (chan_size < 32) {
   2272                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
   2273                                                      0 - (1 << (chan_size - 1)));
   2274                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
   2275                                                      (1 << (chan_size - 1)) - 1);
   2276                }
   2277             }
   2278             /* TODO: no idea about non-pure, non-normalized! */
   2279          }
   2280          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
   2281             if (chan_norm) {
   2282                min_clamp = vec4_bld.zero;
   2283                max_clamp = vec4_bld.one;
   2284             }
   2285             /*
   2286              * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
   2287              * we use Z32_FLOAT_S8X24 to imply sampling depth component
   2288              * and ignoring stencil, which will blow up here if we try to
   2289              * do a uint clamp in a float texel build...
   2290              * And even if we had that format, mesa st also thinks using z24s8
   2291              * means depth sampling ignoring stencil.
   2292              */
   2293             else if (chan_pure) {
   2294                /*
   2295                 * Border color was stored as uint, hence never need min
   2296                 * clamp, and only need max clamp if chan has less than 32 bits.
   2297                 */
   2298                unsigned chan_size = format_desc->channel[chan].size;
   2299                if (chan_size < 32) {
   2300                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
   2301                                                      (1 << chan_size) - 1);
   2302                }
   2303                /* TODO: no idea about non-pure, non-normalized! */
   2304             }
   2305          }
   2306          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
   2307             /* TODO: I have no idea what clamp this would need if any! */
   2308          }
   2309       }
   2310       /* mixed plain formats (or different pure size) */
   2311       switch (format_desc->format) {
   2312       case PIPE_FORMAT_B10G10R10A2_UINT:
   2313       case PIPE_FORMAT_R10G10B10A2_UINT:
   2314       {
   2315          unsigned max10 = (1 << 10) - 1;
   2316          max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
   2317                                         max10, (1 << 2) - 1, NULL);
   2318       }
   2319          break;
   2320       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
   2321          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
   2322                                         -1.0F, 0.0F, NULL);
   2323          max_clamp = vec4_bld.one;
   2324          break;
   2325       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
   2326       case PIPE_FORMAT_R5SG5SB6U_NORM:
   2327          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
   2328                                         0.0F, 0.0F, NULL);
   2329          max_clamp = vec4_bld.one;
   2330          break;
   2331       default:
   2332          break;
   2333       }
   2334    }
   2335    else {
   2336       /* cannot figure this out from format description */
   2337       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
   2338          /* s3tc formats are always unorm */
   2339          min_clamp = vec4_bld.zero;
   2340          max_clamp = vec4_bld.one;
   2341       }
   2342       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
   2343                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
   2344          switch (format_desc->format) {
   2345          case PIPE_FORMAT_RGTC1_UNORM:
   2346          case PIPE_FORMAT_RGTC2_UNORM:
   2347          case PIPE_FORMAT_LATC1_UNORM:
   2348          case PIPE_FORMAT_LATC2_UNORM:
   2349          case PIPE_FORMAT_ETC1_RGB8:
   2350             min_clamp = vec4_bld.zero;
   2351             max_clamp = vec4_bld.one;
   2352             break;
   2353          case PIPE_FORMAT_RGTC1_SNORM:
   2354          case PIPE_FORMAT_RGTC2_SNORM:
   2355          case PIPE_FORMAT_LATC1_SNORM:
   2356          case PIPE_FORMAT_LATC2_SNORM:
   2357             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
   2358             max_clamp = vec4_bld.one;
   2359             break;
   2360          default:
   2361             assert(0);
   2362             break;
   2363          }
   2364       }
   2365       /*
   2366        * all others from subsampled/other group, though we don't care
   2367        * about yuv (and should not have any from zs here)
   2368        */
   2369       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
   2370          switch (format_desc->format) {
   2371          case PIPE_FORMAT_R8G8_B8G8_UNORM:
   2372          case PIPE_FORMAT_G8R8_G8B8_UNORM:
   2373          case PIPE_FORMAT_G8R8_B8R8_UNORM:
   2374          case PIPE_FORMAT_R8G8_R8B8_UNORM:
   2375          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
   2376             min_clamp = vec4_bld.zero;
   2377             max_clamp = vec4_bld.one;
   2378             break;
   2379          case PIPE_FORMAT_R8G8Bx_SNORM:
   2380             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
   2381             max_clamp = vec4_bld.one;
   2382             break;
   2383             /*
   2384              * Note smallfloat formats usually don't need clamping
   2385              * (they still have infinite range) however this is not
   2386              * true for r11g11b10 and r9g9b9e5, which can't represent
   2387              * negative numbers (and additionally r9g9b9e5 can't represent
   2388              * very large numbers). d3d10 seems happy without clamping in
   2389              * this case, but gl spec is pretty clear: "for floating
   2390              * point and integer formats, border values are clamped to
   2391              * the representable range of the format" so do that here.
   2392              */
   2393          case PIPE_FORMAT_R11G11B10_FLOAT:
   2394             min_clamp = vec4_bld.zero;
   2395             break;
   2396          case PIPE_FORMAT_R9G9B9E5_FLOAT:
   2397             min_clamp = vec4_bld.zero;
   2398             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
   2399             break;
   2400          default:
   2401             assert(0);
   2402             break;
   2403          }
   2404       }
   2405    }
   2406 
   2407    if (min_clamp) {
   2408       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
   2409    }
   2410    if (max_clamp) {
   2411       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
   2412    }
   2413 
   2414    bld->border_color_clamped = border_color;
   2415 }
   2416 
   2417 
   2418 /**
   2419  * General texture sampling codegen.
   2420  * This function handles texture sampling for all texture targets (1D,
   2421  * 2D, 3D, cube) and all filtering modes.
   2422  */
   2423 static void
   2424 lp_build_sample_general(struct lp_build_sample_context *bld,
   2425                         unsigned sampler_unit,
   2426                         boolean is_gather,
   2427                         const LLVMValueRef *coords,
   2428                         const LLVMValueRef *offsets,
   2429                         LLVMValueRef lod_positive,
   2430                         LLVMValueRef lod_fpart,
   2431                         LLVMValueRef ilevel0,
   2432                         LLVMValueRef ilevel1,
   2433                         LLVMValueRef *colors_out)
   2434 {
   2435    LLVMBuilderRef builder = bld->gallivm->builder;
   2436    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
   2437    const unsigned mip_filter = sampler_state->min_mip_filter;
   2438    const unsigned min_filter = sampler_state->min_img_filter;
   2439    const unsigned mag_filter = sampler_state->mag_img_filter;
   2440    LLVMValueRef texels[4];
   2441    unsigned chan;
   2442 
   2443    /* if we need border color, (potentially) clamp it now */
   2444    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
   2445                                               min_filter,
   2446                                               mag_filter) ||
   2447        (bld->dims > 1 &&
   2448            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
   2449                                                   min_filter,
   2450                                                   mag_filter)) ||
   2451        (bld->dims > 2 &&
   2452            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
   2453                                                   min_filter,
   2454                                                   mag_filter))) {
   2455       lp_build_clamp_border_color(bld, sampler_unit);
   2456    }
   2457 
   2458 
   2459    /*
   2460     * Get/interpolate texture colors.
   2461     */
   2462 
   2463    for (chan = 0; chan < 4; ++chan) {
   2464      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
   2465      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
   2466    }
   2467 
   2468    if (min_filter == mag_filter) {
   2469       /* no need to distinguish between minification and magnification */
   2470       lp_build_sample_mipmap(bld, min_filter, mip_filter,
   2471                              is_gather,
   2472                              coords, offsets,
   2473                              ilevel0, ilevel1, lod_fpart,
   2474                              texels);
   2475    }
   2476    else {
   2477       /*
   2478        * Could also get rid of the if-logic and always use mipmap_both, both
   2479        * for the single lod and multi-lod case if nothing really uses this.
   2480        */
   2481       if (bld->num_lods == 1) {
   2482          /* Emit conditional to choose min image filter or mag image filter
   2483           * depending on the lod being > 0 or <= 0, respectively.
   2484           */
   2485          struct lp_build_if_state if_ctx;
   2486 
   2487          lod_positive = LLVMBuildTrunc(builder, lod_positive,
   2488                                        LLVMInt1TypeInContext(bld->gallivm->context),
   2489                                        "lod_pos");
   2490 
   2491          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
   2492          {
   2493             /* Use the minification filter */
   2494             lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
   2495                                    coords, offsets,
   2496                                    ilevel0, ilevel1, lod_fpart,
   2497                                    texels);
   2498          }
   2499          lp_build_else(&if_ctx);
   2500          {
   2501             /* Use the magnification filter */
   2502             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
   2503                                    FALSE,
   2504                                    coords, offsets,
   2505                                    ilevel0, NULL, NULL,
   2506                                    texels);
   2507          }
   2508          lp_build_endif(&if_ctx);
   2509       }
   2510       else {
   2511          LLVMValueRef need_linear, linear_mask;
   2512          unsigned mip_filter_for_nearest;
   2513          struct lp_build_if_state if_ctx;
   2514 
   2515          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
   2516             linear_mask = lod_positive;
   2517             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
   2518          }
   2519          else {
   2520             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
   2521             mip_filter_for_nearest = mip_filter;
   2522          }
   2523          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
   2524                                                linear_mask);
   2525          lp_build_name(need_linear, "need_linear");
   2526 
   2527          if (bld->num_lods != bld->coord_type.length) {
   2528             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
   2529                                                                 bld->lodi_type,
   2530                                                                 bld->int_coord_type,
   2531                                                                 linear_mask);
   2532          }
   2533 
   2534          lp_build_if(&if_ctx, bld->gallivm, need_linear);
   2535          {
   2536             /*
   2537              * Do sampling with both filters simultaneously. This means using
   2538              * a linear filter and doing some tricks (with weights) for the pixels
   2539              * which need nearest filter.
   2540              * Note that it's probably rare some pixels need nearest and some
   2541              * linear filter but the fixups required for the nearest pixels
   2542              * aren't all that complicated so just always run a combined path
   2543              * if at least some pixels require linear.
   2544              */
   2545             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
   2546                                         coords, offsets,
   2547                                         ilevel0, ilevel1,
   2548                                         lod_fpart, lod_positive,
   2549                                         texels);
   2550          }
   2551          lp_build_else(&if_ctx);
   2552          {
   2553             /*
   2554              * All pixels require just nearest filtering, which is way
   2555              * cheaper than linear, hence do a separate path for that.
   2556              */
   2557             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
   2558                                    mip_filter_for_nearest, FALSE,
   2559                                    coords, offsets,
   2560                                    ilevel0, ilevel1, lod_fpart,
   2561                                    texels);
   2562          }
   2563          lp_build_endif(&if_ctx);
   2564       }
   2565    }
   2566 
   2567    for (chan = 0; chan < 4; ++chan) {
   2568      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
   2569      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
   2570    }
   2571 }
   2572 
   2573 
   2574 /**
   2575  * Texel fetch function.
   2576  * In contrast to general sampling there is no filtering, no coord minification,
   2577  * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
   2578  * directly to be applied to the selected mip level (after adding texel offsets).
   2579  * This function handles texel fetch for all targets where texel fetch is supported
   2580  * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
   2581  */
   2582 static void
   2583 lp_build_fetch_texel(struct lp_build_sample_context *bld,
   2584                      unsigned texture_unit,
   2585                      const LLVMValueRef *coords,
   2586                      LLVMValueRef explicit_lod,
   2587                      const LLVMValueRef *offsets,
   2588                      LLVMValueRef *colors_out)
   2589 {
   2590    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
   2591    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   2592    unsigned dims = bld->dims, chan;
   2593    unsigned target = bld->static_texture_state->target;
   2594    boolean out_of_bound_ret_zero = TRUE;
   2595    LLVMValueRef size, ilevel;
   2596    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
   2597    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
   2598    LLVMValueRef width, height, depth, i, j;
   2599    LLVMValueRef offset, out_of_bounds, out1;
   2600 
   2601    out_of_bounds = int_coord_bld->zero;
   2602 
   2603    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
   2604       if (bld->num_mips != int_coord_bld->type.length) {
   2605          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
   2606                                             perquadi_bld->type, explicit_lod, 0);
   2607       }
   2608       else {
   2609          ilevel = explicit_lod;
   2610       }
   2611       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
   2612                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
   2613    }
   2614    else {
   2615       assert(bld->num_mips == 1);
   2616       if (bld->static_texture_state->target != PIPE_BUFFER) {
   2617          ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
   2618                                                   bld->context_ptr, texture_unit);
   2619       }
   2620       else {
   2621          ilevel = lp_build_const_int32(bld->gallivm, 0);
   2622       }
   2623    }
   2624    lp_build_mipmap_level_sizes(bld, ilevel,
   2625                                &size,
   2626                                &row_stride_vec, &img_stride_vec);
   2627    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
   2628                                 size, &width, &height, &depth);
   2629 
   2630    if (target == PIPE_TEXTURE_1D_ARRAY ||
   2631        target == PIPE_TEXTURE_2D_ARRAY) {
   2632       if (out_of_bound_ret_zero) {
   2633          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
   2634          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2635       }
   2636       else {
   2637          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
   2638       }
   2639    }
   2640 
   2641    /* This is a lot like border sampling */
   2642    if (offsets[0]) {
   2643       /*
   2644        * coords are really unsigned, offsets are signed, but I don't think
   2645        * exceeding 31 bits is possible
   2646        */
   2647       x = lp_build_add(int_coord_bld, x, offsets[0]);
   2648    }
   2649    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
   2650    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2651    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
   2652    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2653 
   2654    if (dims >= 2) {
   2655       if (offsets[1]) {
   2656          y = lp_build_add(int_coord_bld, y, offsets[1]);
   2657       }
   2658       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
   2659       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2660       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
   2661       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2662 
   2663       if (dims >= 3) {
   2664          if (offsets[2]) {
   2665             z = lp_build_add(int_coord_bld, z, offsets[2]);
   2666          }
   2667          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
   2668          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2669          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
   2670          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   2671       }
   2672    }
   2673 
   2674    lp_build_sample_offset(int_coord_bld,
   2675                           bld->format_desc,
   2676                           x, y, z, row_stride_vec, img_stride_vec,
   2677                           &offset, &i, &j);
   2678 
   2679    if (bld->static_texture_state->target != PIPE_BUFFER) {
   2680       offset = lp_build_add(int_coord_bld, offset,
   2681                             lp_build_get_mip_offsets(bld, ilevel));
   2682    }
   2683 
   2684    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
   2685 
   2686    lp_build_fetch_rgba_soa(bld->gallivm,
   2687                            bld->format_desc,
   2688                            bld->texel_type, TRUE,
   2689                            bld->base_ptr, offset,
   2690                            i, j,
   2691                            bld->cache,
   2692                            colors_out);
   2693 
   2694    if (out_of_bound_ret_zero) {
   2695       /*
   2696        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
   2697        * Could use min/max above instead of out-of-bounds comparisons
   2698        * if we don't care about the result returned for out-of-bounds.
   2699        */
   2700       for (chan = 0; chan < 4; chan++) {
   2701          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
   2702                                             bld->texel_bld.zero, colors_out[chan]);
   2703       }
   2704    }
   2705 }
   2706 
   2707 
   2708 /**
   2709  * Just set texels to white instead of actually sampling the texture.
   2710  * For debugging.
   2711  */
   2712 void
   2713 lp_build_sample_nop(struct gallivm_state *gallivm,
   2714                     struct lp_type type,
   2715                     const LLVMValueRef *coords,
   2716                     LLVMValueRef texel_out[4])
   2717 {
   2718    LLVMValueRef one = lp_build_one(gallivm, type);
   2719    unsigned chan;
   2720 
   2721    for (chan = 0; chan < 4; chan++) {
   2722       texel_out[chan] = one;
   2723    }
   2724 }
   2725 
   2726 
   2727 /**
   2728  * Build the actual texture sampling code.
   2729  * 'texel' will return a vector of four LLVMValueRefs corresponding to
   2730  * R, G, B, A.
   2731  * \param type  vector float type to use for coords, etc.
   2732  * \param sample_key
   2733  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
   2734  */
   2735 static void
   2736 lp_build_sample_soa_code(struct gallivm_state *gallivm,
   2737                          const struct lp_static_texture_state *static_texture_state,
   2738                          const struct lp_static_sampler_state *static_sampler_state,
   2739                          struct lp_sampler_dynamic_state *dynamic_state,
   2740                          struct lp_type type,
   2741                          unsigned sample_key,
   2742                          unsigned texture_index,
   2743                          unsigned sampler_index,
   2744                          LLVMValueRef context_ptr,
   2745                          LLVMValueRef thread_data_ptr,
   2746                          const LLVMValueRef *coords,
   2747                          const LLVMValueRef *offsets,
   2748                          const struct lp_derivatives *derivs, /* optional */
   2749                          LLVMValueRef lod, /* optional */
   2750                          LLVMValueRef texel_out[4])
   2751 {
   2752    unsigned target = static_texture_state->target;
   2753    unsigned dims = texture_dims(target);
   2754    unsigned num_quads = type.length / 4;
   2755    unsigned mip_filter, min_img_filter, mag_img_filter, i;
   2756    struct lp_build_sample_context bld;
   2757    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
   2758    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   2759    LLVMBuilderRef builder = gallivm->builder;
   2760    LLVMValueRef tex_width, newcoords[5];
   2761    enum lp_sampler_lod_property lod_property;
   2762    enum lp_sampler_lod_control lod_control;
   2763    enum lp_sampler_op_type op_type;
   2764    LLVMValueRef lod_bias = NULL;
   2765    LLVMValueRef explicit_lod = NULL;
   2766    boolean op_is_tex, op_is_lodq, op_is_gather;
   2767 
   2768    if (0) {
   2769       enum pipe_format fmt = static_texture_state->format;
   2770       debug_printf("Sample from %s\n", util_format_name(fmt));
   2771    }
   2772 
   2773    lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
   2774                      LP_SAMPLER_LOD_PROPERTY_SHIFT;
   2775    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
   2776                     LP_SAMPLER_LOD_CONTROL_SHIFT;
   2777    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
   2778                  LP_SAMPLER_OP_TYPE_SHIFT;
   2779 
   2780    op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
   2781    op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
   2782    op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
   2783 
   2784    if (lod_control == LP_SAMPLER_LOD_BIAS) {
   2785       lod_bias = lod;
   2786       assert(lod);
   2787       assert(derivs == NULL);
   2788    }
   2789    else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
   2790       explicit_lod = lod;
   2791       assert(lod);
   2792       assert(derivs == NULL);
   2793    }
   2794    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
   2795       assert(derivs);
   2796       assert(lod == NULL);
   2797    }
   2798    else {
   2799       assert(derivs == NULL);
   2800       assert(lod == NULL);
   2801    }
   2802 
   2803    if (static_texture_state->format == PIPE_FORMAT_NONE) {
   2804       /*
   2805        * If there's nothing bound, format is NONE, and we must return
   2806        * all zero as mandated by d3d10 in this case.
   2807        */
   2808       unsigned chan;
   2809       LLVMValueRef zero = lp_build_zero(gallivm, type);
   2810       for (chan = 0; chan < 4; chan++) {
   2811          texel_out[chan] = zero;
   2812       }
   2813       return;
   2814    }
   2815 
   2816    assert(type.floating);
   2817 
   2818    /* Setup our build context */
   2819    memset(&bld, 0, sizeof bld);
   2820    bld.gallivm = gallivm;
   2821    bld.context_ptr = context_ptr;
   2822    bld.static_sampler_state = &derived_sampler_state;
   2823    bld.static_texture_state = static_texture_state;
   2824    bld.dynamic_state = dynamic_state;
   2825    bld.format_desc = util_format_description(static_texture_state->format);
   2826    bld.dims = dims;
   2827 
   2828    if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD || op_is_lodq) {
   2829       bld.no_quad_lod = TRUE;
   2830    }
   2831    if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX || op_is_lodq) {
   2832       bld.no_rho_approx = TRUE;
   2833    }
   2834    if (gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR || op_is_lodq) {
   2835       bld.no_brilinear = TRUE;
   2836    }
   2837 
   2838    bld.vector_width = lp_type_width(type);
   2839 
   2840    bld.float_type = lp_type_float(32);
   2841    bld.int_type = lp_type_int(32);
   2842    bld.coord_type = type;
   2843    bld.int_coord_type = lp_int_type(type);
   2844    bld.float_size_in_type = lp_type_float(32);
   2845    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
   2846    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
   2847    bld.texel_type = type;
   2848 
   2849    /* always using the first channel hopefully should be safe,
   2850     * if not things WILL break in other places anyway.
   2851     */
   2852    if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
   2853        bld.format_desc->channel[0].pure_integer) {
   2854       if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
   2855          bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
   2856       }
   2857       else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
   2858          bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
   2859       }
   2860    }
   2861    else if (util_format_has_stencil(bld.format_desc) &&
   2862        !util_format_has_depth(bld.format_desc)) {
   2863       /* for stencil only formats, sample stencil (uint) */
   2864       bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
   2865    }
   2866 
   2867    if (!static_texture_state->level_zero_only ||
   2868        !static_sampler_state->max_lod_pos || op_is_lodq) {
   2869       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
   2870    } else {
   2871       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   2872    }
   2873    if (op_is_gather) {
   2874       /*
   2875        * gather4 is exactly like GL_LINEAR filtering but in the end skipping
   2876        * the actual filtering. Using mostly the same paths, so cube face
   2877        * selection, coord wrapping etc. all naturally uses the same code.
   2878        */
   2879       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   2880       derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
   2881       derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
   2882    }
   2883    mip_filter = derived_sampler_state.min_mip_filter;
   2884 
   2885    if (0) {
   2886       debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
   2887    }
   2888 
   2889    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
   2890        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
   2891    {
   2892       /*
   2893        * Seamless filtering ignores wrap modes.
   2894        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
   2895        * bilinear it's not correct but way better than using for instance repeat.
   2896        * Note we even set this for non-seamless. Technically GL allows any wrap
   2897        * mode, which made sense when supporting true borders (can get seamless
   2898        * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
   2899        * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
   2900        * up the sampler state (as it makes it texture dependent).
   2901        */
   2902       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   2903       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   2904    }
   2905    /*
   2906     * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
   2907     * so AoS path could be used. Not sure it's worth the trouble...
   2908     */
   2909 
   2910    min_img_filter = derived_sampler_state.min_img_filter;
   2911    mag_img_filter = derived_sampler_state.mag_img_filter;
   2912 
   2913 
   2914    /*
   2915     * This is all a bit complicated; different paths are chosen for performance
   2916     * reasons.
   2917     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
   2918     * everything (the last two options are equivalent for 4-wide case).
   2919     * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
   2920     * lod is calculated then the lod value extracted afterwards so making this
   2921     * case basically the same as far as lod handling is concerned for the
   2922     * further sample/filter code as the 1 lod for everything case.
   2923     * Different lod handling mostly shows up when building mipmap sizes
   2924     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
   2925     * (getting the fractional part of the lod to the right texels).
   2926     */
   2927 
   2928    /*
   2929     * There are other situations where at least the multiple int lods could be
   2930     * avoided like min and max lod being equal.
   2931     */
   2932    bld.num_mips = bld.num_lods = 1;
   2933 
   2934    if (bld.no_quad_lod && bld.no_rho_approx &&
   2935        ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
   2936          (static_texture_state->target == PIPE_TEXTURE_CUBE ||
   2937           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
   2938         op_is_lodq)) {
   2939       /*
   2940        * special case for using per-pixel lod even for implicit lod,
   2941        * which is generally never required (ok by APIs) except to please
   2942        * some (somewhat broken imho) tests (because per-pixel face selection
   2943        * can cause derivatives to be different for pixels outside the primitive
   2944        * due to the major axis division even if pre-project derivatives are
   2945        * looking normal).
   2946        * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
   2947        * cube maps we do indeed get per-pixel lod values).
   2948        */
   2949       bld.num_mips = type.length;
   2950       bld.num_lods = type.length;
   2951    }
   2952    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
   2953        (explicit_lod || lod_bias || derivs)) {
   2954       if ((!op_is_tex && target != PIPE_BUFFER) ||
   2955           (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
   2956          bld.num_mips = type.length;
   2957          bld.num_lods = type.length;
   2958       }
   2959       else if (op_is_tex && min_img_filter != mag_img_filter) {
   2960          bld.num_mips = 1;
   2961          bld.num_lods = type.length;
   2962       }
   2963    }
   2964    /* TODO: for true scalar_lod should only use 1 lod value */
   2965    else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
   2966             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
   2967       bld.num_mips = num_quads;
   2968       bld.num_lods = num_quads;
   2969    }
   2970    else if (op_is_tex && min_img_filter != mag_img_filter) {
   2971       bld.num_mips = 1;
   2972       bld.num_lods = num_quads;
   2973    }
   2974 
   2975 
   2976    bld.lodf_type = type;
   2977    /* we want native vector size to be able to use our intrinsics */
   2978    if (bld.num_lods != type.length) {
   2979       /* TODO: this currently always has to be per-quad or per-element */
   2980       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
   2981    }
   2982    bld.lodi_type = lp_int_type(bld.lodf_type);
   2983    bld.levelf_type = bld.lodf_type;
   2984    if (bld.num_mips == 1) {
   2985       bld.levelf_type.length = 1;
   2986    }
   2987    bld.leveli_type = lp_int_type(bld.levelf_type);
   2988    bld.float_size_type = bld.float_size_in_type;
   2989    /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
   2990     * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
   2991    if (bld.num_mips > 1) {
   2992       bld.float_size_type.length = bld.num_mips == type.length ?
   2993                                       bld.num_mips * bld.float_size_in_type.length :
   2994                                       type.length;
   2995    }
   2996    bld.int_size_type = lp_int_type(bld.float_size_type);
   2997 
   2998    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
   2999    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
   3000    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
   3001    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
   3002    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
   3003    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
   3004    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
   3005    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   3006    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   3007    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
   3008    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
   3009    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
   3010    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
   3011    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
   3012 
   3013    /* Get the dynamic state */
   3014    tex_width = dynamic_state->width(dynamic_state, gallivm,
   3015                                     context_ptr, texture_index);
   3016    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
   3017                                                     context_ptr, texture_index);
   3018    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
   3019                                                     context_ptr, texture_index);
   3020    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
   3021                                           context_ptr, texture_index);
   3022    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
   3023                                                 context_ptr, texture_index);
   3024    /* Note that mip_offsets is an array[level] of offsets to texture images */
   3025 
   3026    if (dynamic_state->cache_ptr && thread_data_ptr) {
   3027       bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
   3028                                            thread_data_ptr, texture_index);
   3029    }
   3030 
   3031    /* width, height, depth as single int vector */
   3032    if (dims <= 1) {
   3033       bld.int_size = tex_width;
   3034    }
   3035    else {
   3036       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
   3037                                             tex_width,
   3038                                             LLVMConstInt(i32t, 0, 0), "");
   3039       if (dims >= 2) {
   3040          LLVMValueRef tex_height =
   3041             dynamic_state->height(dynamic_state, gallivm,
   3042                                   context_ptr, texture_index);
   3043          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
   3044                                                tex_height,
   3045                                                LLVMConstInt(i32t, 1, 0), "");
   3046          if (dims >= 3) {
   3047             LLVMValueRef tex_depth =
   3048                dynamic_state->depth(dynamic_state, gallivm, context_ptr,
   3049                                     texture_index);
   3050             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
   3051                                                   tex_depth,
   3052                                                   LLVMConstInt(i32t, 2, 0), "");
   3053          }
   3054       }
   3055    }
   3056 
   3057    for (i = 0; i < 5; i++) {
   3058       newcoords[i] = coords[i];
   3059    }
   3060 
   3061    if (util_format_is_pure_integer(static_texture_state->format) &&
   3062        !util_format_has_depth(bld.format_desc) && op_is_tex &&
   3063        (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
   3064         static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
   3065         static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
   3066       /*
   3067        * Bail if impossible filtering is specified (the awkward additional
   3068        * depth check is because it is legal in gallium to have things like S8Z24
   3069        * here which would say it's pure int despite such formats should sample
   3070        * the depth component).
   3071        * In GL such filters make the texture incomplete, this makes it robust
   3072        * against state trackers which set this up regardless (we'd crash in the
   3073        * lerp later otherwise).
   3074        * At least in some apis it may be legal to use such filters with lod
   3075        * queries and/or gather (at least for gather d3d10 says only the wrap
   3076        * bits are really used hence filter bits are likely simply ignored).
   3077        * For fetch, we don't get valid samplers either way here.
   3078        */
   3079       unsigned chan;
   3080       LLVMValueRef zero = lp_build_zero(gallivm, type);
   3081       for (chan = 0; chan < 4; chan++) {
   3082          texel_out[chan] = zero;
   3083       }
   3084       return;
   3085    }
   3086 
   3087    if (0) {
   3088       /* For debug: no-op texture sampling */
   3089       lp_build_sample_nop(gallivm,
   3090                           bld.texel_type,
   3091                           newcoords,
   3092                           texel_out);
   3093    }
   3094 
   3095    else if (op_type == LP_SAMPLER_OP_FETCH) {
   3096       lp_build_fetch_texel(&bld, texture_index, newcoords,
   3097                            lod, offsets,
   3098                            texel_out);
   3099    }
   3100 
   3101    else {
   3102       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
   3103       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
   3104       boolean use_aos;
   3105 
   3106       use_aos = util_format_fits_8unorm(bld.format_desc) &&
   3107                 op_is_tex &&
   3108                 /* not sure this is strictly needed or simply impossible */
   3109                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
   3110                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
   3111 
   3112       use_aos &= bld.num_lods <= num_quads ||
   3113                  derived_sampler_state.min_img_filter ==
   3114                     derived_sampler_state.mag_img_filter;
   3115       if (dims > 1) {
   3116          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
   3117          if (dims > 2) {
   3118             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
   3119          }
   3120       }
   3121       if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
   3122            static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
   3123           derived_sampler_state.seamless_cube_map &&
   3124           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
   3125            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
   3126          /* theoretically possible with AoS filtering but not implemented (complex!) */
   3127          use_aos = 0;
   3128       }
   3129 
   3130       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
   3131           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
   3132          debug_printf("%s: using floating point linear filtering for %s\n",
   3133                       __FUNCTION__, bld.format_desc->short_name);
   3134          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
   3135                       "  wraps %d  wrapt %d  wrapr %d\n",
   3136                       derived_sampler_state.min_img_filter,
   3137                       derived_sampler_state.mag_img_filter,
   3138                       derived_sampler_state.min_mip_filter,
   3139                       static_texture_state->target,
   3140                       derived_sampler_state.seamless_cube_map,
   3141                       derived_sampler_state.wrap_s,
   3142                       derived_sampler_state.wrap_t,
   3143                       derived_sampler_state.wrap_r);
   3144       }
   3145 
   3146       lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
   3147                              newcoords,
   3148                              derivs, lod_bias, explicit_lod,
   3149                              &lod_positive, &lod, &lod_fpart,
   3150                              &ilevel0, &ilevel1);
   3151 
   3152       if (op_is_lodq) {
   3153          texel_out[0] = lod_fpart;
   3154          texel_out[1] = lod;
   3155          texel_out[2] = texel_out[3] = bld.coord_bld.zero;
   3156          return;
   3157       }
   3158 
   3159       if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
   3160          /* The aos path doesn't do seamless filtering so simply add cube layer
   3161           * to face now.
   3162           */
   3163          newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
   3164       }
   3165 
   3166       /*
   3167        * we only try 8-wide sampling with soa or if we have AVX2
   3168        * as it appears to be a loss with just AVX)
   3169        */
   3170       if (num_quads == 1 || !use_aos ||
   3171           (util_cpu_caps.has_avx2 &&
   3172            (bld.num_lods == 1 ||
   3173             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
   3174          if (use_aos) {
   3175             /* do sampling/filtering with fixed pt arithmetic */
   3176             lp_build_sample_aos(&bld, sampler_index,
   3177                                 newcoords[0], newcoords[1],
   3178                                 newcoords[2],
   3179                                 offsets, lod_positive, lod_fpart,
   3180                                 ilevel0, ilevel1,
   3181                                 texel_out);
   3182          }
   3183 
   3184          else {
   3185             lp_build_sample_general(&bld, sampler_index,
   3186                                     op_type == LP_SAMPLER_OP_GATHER,
   3187                                     newcoords, offsets,
   3188                                     lod_positive, lod_fpart,
   3189                                     ilevel0, ilevel1,
   3190                                     texel_out);
   3191          }
   3192       }
   3193       else {
   3194          unsigned j;
   3195          struct lp_build_sample_context bld4;
   3196          struct lp_type type4 = type;
   3197          unsigned i;
   3198          LLVMValueRef texelout4[4];
   3199          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
   3200 
   3201          type4.length = 4;
   3202 
   3203          /* Setup our build context */
   3204          memset(&bld4, 0, sizeof bld4);
   3205          bld4.no_quad_lod = bld.no_quad_lod;
   3206          bld4.no_rho_approx = bld.no_rho_approx;
   3207          bld4.no_brilinear = bld.no_brilinear;
   3208          bld4.gallivm = bld.gallivm;
   3209          bld4.context_ptr = bld.context_ptr;
   3210          bld4.static_texture_state = bld.static_texture_state;
   3211          bld4.static_sampler_state = bld.static_sampler_state;
   3212          bld4.dynamic_state = bld.dynamic_state;
   3213          bld4.format_desc = bld.format_desc;
   3214          bld4.dims = bld.dims;
   3215          bld4.row_stride_array = bld.row_stride_array;
   3216          bld4.img_stride_array = bld.img_stride_array;
   3217          bld4.base_ptr = bld.base_ptr;
   3218          bld4.mip_offsets = bld.mip_offsets;
   3219          bld4.int_size = bld.int_size;
   3220          bld4.cache = bld.cache;
   3221 
   3222          bld4.vector_width = lp_type_width(type4);
   3223 
   3224          bld4.float_type = lp_type_float(32);
   3225          bld4.int_type = lp_type_int(32);
   3226          bld4.coord_type = type4;
   3227          bld4.int_coord_type = lp_int_type(type4);
   3228          bld4.float_size_in_type = lp_type_float(32);
   3229          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
   3230          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
   3231          bld4.texel_type = bld.texel_type;
   3232          bld4.texel_type.length = 4;
   3233 
   3234          bld4.num_mips = bld4.num_lods = 1;
   3235          if (bld4.no_quad_lod && bld4.no_rho_approx &&
   3236              (static_texture_state->target == PIPE_TEXTURE_CUBE ||
   3237               static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
   3238              (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
   3239             bld4.num_mips = type4.length;
   3240             bld4.num_lods = type4.length;
   3241          }
   3242          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
   3243              (explicit_lod || lod_bias || derivs)) {
   3244             if ((!op_is_tex && target != PIPE_BUFFER) ||
   3245                 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
   3246                bld4.num_mips = type4.length;
   3247                bld4.num_lods = type4.length;
   3248             }
   3249             else if (op_is_tex && min_img_filter != mag_img_filter) {
   3250                bld4.num_mips = 1;
   3251                bld4.num_lods = type4.length;
   3252             }
   3253          }
   3254 
   3255          /* we want native vector size to be able to use our intrinsics */
   3256          bld4.lodf_type = type4;
   3257          if (bld4.num_lods != type4.length) {
   3258             bld4.lodf_type.length = 1;
   3259          }
   3260          bld4.lodi_type = lp_int_type(bld4.lodf_type);
   3261          bld4.levelf_type = type4;
   3262          if (bld4.num_mips != type4.length) {
   3263             bld4.levelf_type.length = 1;
   3264          }
   3265          bld4.leveli_type = lp_int_type(bld4.levelf_type);
   3266          bld4.float_size_type = bld4.float_size_in_type;
   3267          if (bld4.num_mips > 1) {
   3268             bld4.float_size_type.length = bld4.num_mips == type4.length ?
   3269                                             bld4.num_mips * bld4.float_size_in_type.length :
   3270                                             type4.length;
   3271          }
   3272          bld4.int_size_type = lp_int_type(bld4.float_size_type);
   3273 
   3274          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
   3275          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
   3276          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
   3277          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
   3278          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
   3279          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
   3280          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
   3281          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
   3282          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
   3283          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
   3284          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
   3285          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
   3286          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
   3287          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
   3288 
   3289          for (i = 0; i < num_quads; i++) {
   3290             LLVMValueRef s4, t4, r4;
   3291             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
   3292             LLVMValueRef ilevel04, ilevel14 = NULL;
   3293             LLVMValueRef offsets4[4] = { NULL };
   3294             unsigned num_lods = bld4.num_lods;
   3295 
   3296             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
   3297             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
   3298             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
   3299 
   3300             if (offsets[0]) {
   3301                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
   3302                if (dims > 1) {
   3303                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
   3304                   if (dims > 2) {
   3305                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
   3306                   }
   3307                }
   3308             }
   3309             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
   3310             ilevel04 = bld.num_mips == 1 ? ilevel0 :
   3311                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
   3312             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
   3313                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
   3314                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
   3315             }
   3316 
   3317             if (use_aos) {
   3318                /* do sampling/filtering with fixed pt arithmetic */
   3319                lp_build_sample_aos(&bld4, sampler_index,
   3320                                    s4, t4, r4, offsets4,
   3321                                    lod_positive4, lod_fpart4,
   3322                                    ilevel04, ilevel14,
   3323                                    texelout4);
   3324             }
   3325 
   3326             else {
   3327                /* this path is currently unreachable and hence might break easily... */
   3328                LLVMValueRef newcoords4[5];
   3329                newcoords4[0] = s4;
   3330                newcoords4[1] = t4;
   3331                newcoords4[2] = r4;
   3332                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
   3333                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
   3334 
   3335                lp_build_sample_general(&bld4, sampler_index,
   3336                                        op_type == LP_SAMPLER_OP_GATHER,
   3337                                        newcoords4, offsets4,
   3338                                        lod_positive4, lod_fpart4,
   3339                                        ilevel04, ilevel14,
   3340                                        texelout4);
   3341             }
   3342             for (j = 0; j < 4; j++) {
   3343                texelouttmp[j][i] = texelout4[j];
   3344             }
   3345          }
   3346 
   3347          for (j = 0; j < 4; j++) {
   3348             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
   3349          }
   3350       }
   3351    }
   3352 
   3353    if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
   3354       apply_sampler_swizzle(&bld, texel_out);
   3355    }
   3356 
   3357    /*
   3358     * texel type can be a (32bit) int/uint (for pure int formats only),
   3359     * however we are expected to always return floats (storage is untyped).
   3360     */
   3361    if (!bld.texel_type.floating) {
   3362       unsigned chan;
   3363       for (chan = 0; chan < 4; chan++) {
   3364          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
   3365                                             lp_build_vec_type(gallivm, type), "");
   3366       }
   3367    }
   3368 }
   3369 
   3370 
   3371 #define USE_TEX_FUNC_CALL 1
   3372 
   3373 #define LP_MAX_TEX_FUNC_ARGS 32
   3374 
   3375 static inline void
   3376 get_target_info(enum pipe_texture_target target,
   3377                 unsigned *num_coords, unsigned *num_derivs,
   3378                 unsigned *num_offsets, unsigned *layer)
   3379 {
   3380    unsigned dims = texture_dims(target);
   3381    *num_coords = dims;
   3382    *num_offsets = dims;
   3383    *num_derivs = (target == PIPE_TEXTURE_CUBE ||
   3384                   target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
   3385    *layer = has_layer_coord(target) ? 2: 0;
   3386    if (target == PIPE_TEXTURE_CUBE_ARRAY) {
   3387       /*
   3388        * dims doesn't include r coord for cubes - this is handled
   3389        * by layer instead, but need to fix up for cube arrays...
   3390        */
   3391       *layer = 3;
   3392       *num_coords = 3;
   3393    }
   3394 }
   3395 
   3396 
   3397 /**
   3398  * Generate the function body for a texture sampling function.
   3399  */
   3400 static void
   3401 lp_build_sample_gen_func(struct gallivm_state *gallivm,
   3402                          const struct lp_static_texture_state *static_texture_state,
   3403                          const struct lp_static_sampler_state *static_sampler_state,
   3404                          struct lp_sampler_dynamic_state *dynamic_state,
   3405                          struct lp_type type,
   3406                          unsigned texture_index,
   3407                          unsigned sampler_index,
   3408                          LLVMValueRef function,
   3409                          unsigned num_args,
   3410                          unsigned sample_key)
   3411 {
   3412    LLVMBuilderRef old_builder;
   3413    LLVMBasicBlockRef block;
   3414    LLVMValueRef coords[5];
   3415    LLVMValueRef offsets[3] = { NULL };
   3416    LLVMValueRef lod = NULL;
   3417    LLVMValueRef context_ptr;
   3418    LLVMValueRef thread_data_ptr = NULL;
   3419    LLVMValueRef texel_out[4];
   3420    struct lp_derivatives derivs;
   3421    struct lp_derivatives *deriv_ptr = NULL;
   3422    unsigned num_param = 0;
   3423    unsigned i, num_coords, num_derivs, num_offsets, layer;
   3424    enum lp_sampler_lod_control lod_control;
   3425    boolean need_cache = FALSE;
   3426 
   3427    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
   3428                     LP_SAMPLER_LOD_CONTROL_SHIFT;
   3429 
   3430    get_target_info(static_texture_state->target,
   3431                    &num_coords, &num_derivs, &num_offsets, &layer);
   3432 
   3433    if (dynamic_state->cache_ptr) {
   3434       const struct util_format_description *format_desc;
   3435       format_desc = util_format_description(static_texture_state->format);
   3436       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
   3437          need_cache = TRUE;
   3438       }
   3439    }
   3440 
   3441    /* "unpack" arguments */
   3442    context_ptr = LLVMGetParam(function, num_param++);
   3443    if (need_cache) {
   3444       thread_data_ptr = LLVMGetParam(function, num_param++);
   3445    }
   3446    for (i = 0; i < num_coords; i++) {
   3447       coords[i] = LLVMGetParam(function, num_param++);
   3448    }
   3449    for (i = num_coords; i < 5; i++) {
   3450       /* This is rather unfortunate... */
   3451       coords[i] = lp_build_undef(gallivm, type);
   3452    }
   3453    if (layer) {
   3454       coords[layer] = LLVMGetParam(function, num_param++);
   3455    }
   3456    if (sample_key & LP_SAMPLER_SHADOW) {
   3457       coords[4] = LLVMGetParam(function, num_param++);
   3458    }
   3459    if (sample_key & LP_SAMPLER_OFFSETS) {
   3460       for (i = 0; i < num_offsets; i++) {
   3461          offsets[i] = LLVMGetParam(function, num_param++);
   3462       }
   3463    }
   3464    if (lod_control == LP_SAMPLER_LOD_BIAS ||
   3465        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
   3466       lod = LLVMGetParam(function, num_param++);
   3467    }
   3468    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
   3469       for (i = 0; i < num_derivs; i++) {
   3470          derivs.ddx[i] = LLVMGetParam(function, num_param++);
   3471          derivs.ddy[i] = LLVMGetParam(function, num_param++);
   3472       }
   3473       deriv_ptr = &derivs;
   3474    }
   3475 
   3476    assert(num_args == num_param);
   3477 
   3478    /*
   3479     * Function body
   3480     */
   3481 
   3482    old_builder = gallivm->builder;
   3483    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
   3484    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
   3485    LLVMPositionBuilderAtEnd(gallivm->builder, block);
   3486 
   3487    lp_build_sample_soa_code(gallivm,
   3488                             static_texture_state,
   3489                             static_sampler_state,
   3490                             dynamic_state,
   3491                             type,
   3492                             sample_key,
   3493                             texture_index,
   3494                             sampler_index,
   3495                             context_ptr,
   3496                             thread_data_ptr,
   3497                             coords,
   3498                             offsets,
   3499                             deriv_ptr,
   3500                             lod,
   3501                             texel_out);
   3502 
   3503    LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
   3504 
   3505    LLVMDisposeBuilder(gallivm->builder);
   3506    gallivm->builder = old_builder;
   3507 
   3508    gallivm_verify_function(gallivm, function);
   3509 }
   3510 
   3511 
   3512 /**
   3513  * Call the matching function for texture sampling.
   3514  * If there's no match, generate a new one.
   3515  */
   3516 static void
   3517 lp_build_sample_soa_func(struct gallivm_state *gallivm,
   3518                          const struct lp_static_texture_state *static_texture_state,
   3519                          const struct lp_static_sampler_state *static_sampler_state,
   3520                          struct lp_sampler_dynamic_state *dynamic_state,
   3521                          const struct lp_sampler_params *params)
   3522 {
   3523    LLVMBuilderRef builder = gallivm->builder;
   3524    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
   3525                              LLVMGetInsertBlock(builder)));
   3526    LLVMValueRef function, inst;
   3527    LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
   3528    LLVMBasicBlockRef bb;
   3529    LLVMValueRef tex_ret;
   3530    unsigned num_args = 0;
   3531    char func_name[64];
   3532    unsigned i, num_coords, num_derivs, num_offsets, layer;
   3533    unsigned texture_index = params->texture_index;
   3534    unsigned sampler_index = params->sampler_index;
   3535    unsigned sample_key = params->sample_key;
   3536    const LLVMValueRef *coords = params->coords;
   3537    const LLVMValueRef *offsets = params->offsets;
   3538    const struct lp_derivatives *derivs = params->derivs;
   3539    enum lp_sampler_lod_control lod_control;
   3540    boolean need_cache = FALSE;
   3541 
   3542    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
   3543                     LP_SAMPLER_LOD_CONTROL_SHIFT;
   3544 
   3545    get_target_info(static_texture_state->target,
   3546                    &num_coords, &num_derivs, &num_offsets, &layer);
   3547 
   3548    if (dynamic_state->cache_ptr) {
   3549       const struct util_format_description *format_desc;
   3550       format_desc = util_format_description(static_texture_state->format);
   3551       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
   3552          /*
   3553           * This is not 100% correct, if we have cache but the
   3554           * util_format_s3tc_prefer is true the cache won't get used
   3555           * regardless (could hook up the block decode there...) */
   3556          need_cache = TRUE;
   3557       }
   3558    }
   3559    /*
   3560     * texture function matches are found by name.
   3561     * Thus the name has to include both the texture and sampler unit
   3562     * (which covers all static state) plus the actual texture function
   3563     * (including things like offsets, shadow coord, lod control).
   3564     * Additionally lod_property has to be included too.
   3565     */
   3566 
   3567    util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
   3568                  texture_index, sampler_index, sample_key);
   3569 
   3570    function = LLVMGetNamedFunction(module, func_name);
   3571 
   3572    if(!function) {
   3573       LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
   3574       LLVMTypeRef ret_type;
   3575       LLVMTypeRef function_type;
   3576       LLVMTypeRef val_type[4];
   3577       unsigned num_param = 0;
   3578 
   3579       /*
   3580        * Generate the function prototype.
   3581        */
   3582 
   3583       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
   3584       if (need_cache) {
   3585          arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
   3586       }
   3587       for (i = 0; i < num_coords; i++) {
   3588          arg_types[num_param++] = LLVMTypeOf(coords[0]);
   3589          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
   3590       }
   3591       if (layer) {
   3592          arg_types[num_param++] = LLVMTypeOf(coords[layer]);
   3593          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
   3594       }
   3595       if (sample_key & LP_SAMPLER_SHADOW) {
   3596          arg_types[num_param++] = LLVMTypeOf(coords[0]);
   3597       }
   3598       if (sample_key & LP_SAMPLER_OFFSETS) {
   3599          for (i = 0; i < num_offsets; i++) {
   3600             arg_types[num_param++] = LLVMTypeOf(offsets[0]);
   3601             assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
   3602          }
   3603       }
   3604       if (lod_control == LP_SAMPLER_LOD_BIAS ||
   3605           lod_control == LP_SAMPLER_LOD_EXPLICIT) {
   3606          arg_types[num_param++] = LLVMTypeOf(params->lod);
   3607       }
   3608       else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
   3609          for (i = 0; i < num_derivs; i++) {
   3610             arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
   3611             arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
   3612             assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
   3613             assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
   3614          }
   3615       }
   3616 
   3617       val_type[0] = val_type[1] = val_type[2] = val_type[3] =
   3618          lp_build_vec_type(gallivm, params->type);
   3619       ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
   3620       function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
   3621       function = LLVMAddFunction(module, func_name, function_type);
   3622 
   3623       for (i = 0; i < num_param; ++i) {
   3624          if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
   3625 
   3626             lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
   3627          }
   3628       }
   3629 
   3630       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
   3631       LLVMSetLinkage(function, LLVMInternalLinkage);
   3632 
   3633       lp_build_sample_gen_func(gallivm,
   3634                                static_texture_state,
   3635                                static_sampler_state,
   3636                                dynamic_state,
   3637                                params->type,
   3638                                texture_index,
   3639                                sampler_index,
   3640                                function,
   3641                                num_param,
   3642                                sample_key);
   3643    }
   3644 
   3645    num_args = 0;
   3646    args[num_args++] = params->context_ptr;
   3647    if (need_cache) {
   3648       args[num_args++] = params->thread_data_ptr;
   3649    }
   3650    for (i = 0; i < num_coords; i++) {
   3651       args[num_args++] = coords[i];
   3652    }
   3653    if (layer) {
   3654       args[num_args++] = coords[layer];
   3655    }
   3656    if (sample_key & LP_SAMPLER_SHADOW) {
   3657       args[num_args++] = coords[4];
   3658    }
   3659    if (sample_key & LP_SAMPLER_OFFSETS) {
   3660       for (i = 0; i < num_offsets; i++) {
   3661          args[num_args++] = offsets[i];
   3662       }
   3663    }
   3664    if (lod_control == LP_SAMPLER_LOD_BIAS ||
   3665        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
   3666       args[num_args++] = params->lod;
   3667    }
   3668    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
   3669       for (i = 0; i < num_derivs; i++) {
   3670          args[num_args++] = derivs->ddx[i];
   3671          args[num_args++] = derivs->ddy[i];
   3672       }
   3673    }
   3674 
   3675    assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
   3676 
   3677    tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
   3678    bb = LLVMGetInsertBlock(builder);
   3679    inst = LLVMGetLastInstruction(bb);
   3680    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
   3681 
   3682    for (i = 0; i < 4; i++) {
   3683       params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
   3684    }
   3685 }
   3686 
   3687 
   3688 /**
   3689  * Build texture sampling code.
   3690  * Either via a function call or inline it directly.
   3691  */
   3692 void
   3693 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
   3694                     const struct lp_static_sampler_state *static_sampler_state,
   3695                     struct lp_sampler_dynamic_state *dynamic_state,
   3696                     struct gallivm_state *gallivm,
   3697                     const struct lp_sampler_params *params)
   3698 {
   3699    boolean use_tex_func = FALSE;
   3700 
   3701    /*
   3702     * Do not use a function call if the sampling is "simple enough".
   3703     * We define this by
   3704     * a) format
   3705     * b) no mips (either one level only or no mip filter)
   3706     * No mips will definitely make the code smaller, though
   3707     * the format requirement is a bit iffy - there's some (SoA) formats
   3708     * which definitely generate less code. This does happen to catch
   3709     * some important cases though which are hurt quite a bit by using
   3710     * a call (though not really because of the call overhead but because
   3711     * they are reusing the same texture unit with some of the same
   3712     * parameters).
   3713     * Ideally we'd let llvm recognize this stuff by doing IPO passes.
   3714     */
   3715 
   3716    if (USE_TEX_FUNC_CALL) {
   3717       const struct util_format_description *format_desc;
   3718       boolean simple_format;
   3719       boolean simple_tex;
   3720       enum lp_sampler_op_type op_type;
   3721       format_desc = util_format_description(static_texture_state->format);
   3722       simple_format = !format_desc ||
   3723                          (util_format_is_rgba8_variant(format_desc) &&
   3724                           format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
   3725 
   3726       op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
   3727                     LP_SAMPLER_OP_TYPE_SHIFT;
   3728       simple_tex =
   3729          op_type != LP_SAMPLER_OP_TEXTURE ||
   3730            ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
   3731              static_texture_state->level_zero_only == TRUE) &&
   3732             static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
   3733 
   3734       use_tex_func = format_desc && !(simple_format && simple_tex);
   3735    }
   3736 
   3737    if (use_tex_func) {
   3738       lp_build_sample_soa_func(gallivm,
   3739                                static_texture_state,
   3740                                static_sampler_state,
   3741                                dynamic_state,
   3742                                params);
   3743    }
   3744    else {
   3745       lp_build_sample_soa_code(gallivm,
   3746                                static_texture_state,
   3747                                static_sampler_state,
   3748                                dynamic_state,
   3749                                params->type,
   3750                                params->sample_key,
   3751                                params->texture_index,
   3752                                params->sampler_index,
   3753                                params->context_ptr,
   3754                                params->thread_data_ptr,
   3755                                params->coords,
   3756                                params->offsets,
   3757                                params->derivs,
   3758                                params->lod,
   3759                                params->texel);
   3760    }
   3761 }
   3762 
   3763 
   3764 void
   3765 lp_build_size_query_soa(struct gallivm_state *gallivm,
   3766                         const struct lp_static_texture_state *static_state,
   3767                         struct lp_sampler_dynamic_state *dynamic_state,
   3768                         const struct lp_sampler_size_query_params *params)
   3769 {
   3770    LLVMValueRef lod, level = 0, size;
   3771    LLVMValueRef first_level = NULL;
   3772    int dims, i;
   3773    boolean has_array;
   3774    unsigned num_lods = 1;
   3775    struct lp_build_context bld_int_vec4;
   3776    LLVMValueRef context_ptr = params->context_ptr;
   3777    unsigned texture_unit = params->texture_unit;
   3778    unsigned target = params->target;
   3779 
   3780    if (static_state->format == PIPE_FORMAT_NONE) {
   3781       /*
   3782        * If there's nothing bound, format is NONE, and we must return
   3783        * all zero as mandated by d3d10 in this case.
   3784        */
   3785       unsigned chan;
   3786       LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
   3787       for (chan = 0; chan < 4; chan++) {
   3788          params->sizes_out[chan] = zero;
   3789       }
   3790       return;
   3791    }
   3792 
   3793    /*
   3794     * Do some sanity verification about bound texture and shader dcl target.
   3795     * Not entirely sure what's possible but assume array/non-array
   3796     * always compatible (probably not ok for OpenGL but d3d10 has no
   3797     * distinction of arrays at the resource level).
   3798     * Everything else looks bogus (though not entirely sure about rect/2d).
   3799     * Currently disabled because it causes assertion failures if there's
   3800     * nothing bound (or rather a dummy texture, not that this case would
   3801     * return the right values).
   3802     */
   3803    if (0 && static_state->target != target) {
   3804       if (static_state->target == PIPE_TEXTURE_1D)
   3805          assert(target == PIPE_TEXTURE_1D_ARRAY);
   3806       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
   3807          assert(target == PIPE_TEXTURE_1D);
   3808       else if (static_state->target == PIPE_TEXTURE_2D)
   3809          assert(target == PIPE_TEXTURE_2D_ARRAY);
   3810       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
   3811          assert(target == PIPE_TEXTURE_2D);
   3812       else if (static_state->target == PIPE_TEXTURE_CUBE)
   3813          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
   3814       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
   3815          assert(target == PIPE_TEXTURE_CUBE);
   3816       else
   3817          assert(0);
   3818    }
   3819 
   3820    dims = texture_dims(target);
   3821 
   3822    switch (target) {
   3823    case PIPE_TEXTURE_1D_ARRAY:
   3824    case PIPE_TEXTURE_2D_ARRAY:
   3825    case PIPE_TEXTURE_CUBE_ARRAY:
   3826       has_array = TRUE;
   3827       break;
   3828    default:
   3829       has_array = FALSE;
   3830       break;
   3831    }
   3832 
   3833    assert(!params->int_type.floating);
   3834 
   3835    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
   3836 
   3837    if (params->explicit_lod) {
   3838       /* FIXME: this needs to honor per-element lod */
   3839       lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
   3840                                     lp_build_const_int32(gallivm, 0), "");
   3841       first_level = dynamic_state->first_level(dynamic_state, gallivm,
   3842                                                context_ptr, texture_unit);
   3843       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
   3844       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
   3845    } else {
   3846       lod = bld_int_vec4.zero;
   3847    }
   3848 
   3849    size = bld_int_vec4.undef;
   3850 
   3851    size = LLVMBuildInsertElement(gallivm->builder, size,
   3852                                  dynamic_state->width(dynamic_state, gallivm,
   3853                                                       context_ptr, texture_unit),
   3854                                  lp_build_const_int32(gallivm, 0), "");
   3855 
   3856    if (dims >= 2) {
   3857       size = LLVMBuildInsertElement(gallivm->builder, size,
   3858                                     dynamic_state->height(dynamic_state, gallivm,
   3859                                                           context_ptr, texture_unit),
   3860                                     lp_build_const_int32(gallivm, 1), "");
   3861    }
   3862 
   3863    if (dims >= 3) {
   3864       size = LLVMBuildInsertElement(gallivm->builder, size,
   3865                                     dynamic_state->depth(dynamic_state, gallivm,
   3866                                                          context_ptr, texture_unit),
   3867                                     lp_build_const_int32(gallivm, 2), "");
   3868    }
   3869 
   3870    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
   3871 
   3872    if (has_array) {
   3873       LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
   3874                                                  context_ptr, texture_unit);
   3875       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
   3876          /*
   3877           * It looks like GL wants number of cubes, d3d10.1 has it undefined?
   3878           * Could avoid this by passing in number of cubes instead of total
   3879           * number of layers (might make things easier elsewhere too).
   3880           */
   3881          LLVMValueRef six = lp_build_const_int32(gallivm, 6);
   3882          layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
   3883       }
   3884       size = LLVMBuildInsertElement(gallivm->builder, size, layers,
   3885                                     lp_build_const_int32(gallivm, dims), "");
   3886    }
   3887 
   3888    /*
   3889     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
   3890     * if level is out of bounds (note this can't cover unbound texture
   3891     * here, which also requires returning zero).
   3892     */
   3893    if (params->explicit_lod && params->is_sviewinfo) {
   3894       LLVMValueRef last_level, out, out1;
   3895       struct lp_build_context leveli_bld;
   3896 
   3897       /* everything is scalar for now */
   3898       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
   3899       last_level = dynamic_state->last_level(dynamic_state, gallivm,
   3900                                              context_ptr, texture_unit);
   3901 
   3902       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
   3903       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
   3904       out = lp_build_or(&leveli_bld, out, out1);
   3905       if (num_lods == 1) {
   3906          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
   3907       }
   3908       else {
   3909          /* TODO */
   3910          assert(0);
   3911       }
   3912       size = lp_build_andnot(&bld_int_vec4, size, out);
   3913    }
   3914    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
   3915       params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
   3916                                                 size,
   3917                                                 lp_build_const_int32(gallivm, i));
   3918    }
   3919    if (params->is_sviewinfo) {
   3920       for (; i < 4; i++) {
   3921          params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
   3922       }
   3923    }
   3924 
   3925    /*
   3926     * if there's no explicit_lod (buffers, rects) queries requiring nr of
   3927     * mips would be illegal.
   3928     */
   3929    if (params->is_sviewinfo && params->explicit_lod) {
   3930       struct lp_build_context bld_int_scalar;
   3931       LLVMValueRef num_levels;
   3932       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
   3933 
   3934       if (static_state->level_zero_only) {
   3935          num_levels = bld_int_scalar.one;
   3936       }
   3937       else {
   3938          LLVMValueRef last_level;
   3939 
   3940          last_level = dynamic_state->last_level(dynamic_state, gallivm,
   3941                                                 context_ptr, texture_unit);
   3942          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
   3943          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
   3944       }
   3945       params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
   3946                                         num_levels);
   3947    }
   3948 }
   3949