/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"
#include "lp_bld_bitarit.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}


/**
 * Initialize the lp_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format            = view->format;
   state->swizzle_r         = view->swizzle_r;
   state->swizzle_g         = view->swizzle_g;
   state->swizzle_b         = view->swizzle_b;
   state->swizzle_a         = view->swizzle_a;

   state->target            = view->target;
   state->pot_width         = util_is_power_of_two(texture->width0);
   state->pot_height        = util_is_power_of_two(texture->height0);
   state->pot_depth         = util_is_power_of_two(texture->depth0);
   state->level_zero_only   = !view->u.tex.last_level;

   /*
    * the layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize the lp_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that can be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s            = sampler->wrap_s;
   state->wrap_t            = sampler->wrap_t;
   state->wrap_r            = sampler->wrap_r;
   state->min_img_filter    = sampler->min_img_filter;
   state->mag_img_filter    = sampler->mag_img_filter;
   state->seamless_cube_map = sampler->seamless_cube_map;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE ||
       state->min_img_filter != state->mag_img_filter) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * always sets max_lod to no more than the number of mip levels
          * actually present...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode      = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func   = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}


/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho has bld->levelf format (per quad or per element).
 */
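/*
 * In rough outline, the default (approximated) path below computes, per
 * coordinate i with size_i the texture size minified to first_level:
 *
 *    rho = max_i( size_i * max(|ds_i/dx|, |ds_i/dy|) )
 *
 * while the GALLIVM_DEBUG_NO_RHO_APPROX path computes the squared
 * euclidean norm instead:
 *
 *    rho^2 = max( sum_i (size_i * ds_i/dx)^2, sum_i (size_i * ds_i/dy)^2 )
 *
 * and skips the sqrt; callers compensate by halving log2(rho^2).
 */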
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *rho_bld = &bld->lodf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2] = {NULL};
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   boolean rho_per_quad = rho_bld->type.length != length;
   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calculations are always per quad, except when explicit derivatives
    * (excluding the messy cube maps for now) are given and per-element lod
    * is requested.
    */

   first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
                                                 bld->context_ptr, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * The cube map code already did everything except the size mul and
       * the per-quad extraction. Luckily cube map faces are always square!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      /* Could optimize this for the single quad case by just skipping the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            rho_bld->type, float_size, index0);
      /* skipping sqrt hence returning rho squared */
      cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
      rho = lp_build_mul(rho_bld, cubesize, rho);
   }
   else if (derivs) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         /*
          * Note that for the rho_per_quad case we could reduce the math (at
          * some shuffle cost), but for now use the same code as for the
          * per-pixel lod case.
          */
         if (no_rho_opt) {
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if (no_rho_opt) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         /* skipping sqrt hence returning rho squared */
      }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
      }
      if (rho_per_quad) {
         /*
          * rho_vec contains per-pixel rho, convert to scalar per quad.
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, rho, 0);
      }
   }
   else {
      /*
       * This all looks a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, it might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if (no_rho_opt) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            rho_bld->type, rho, 0);
         }
         else {
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         /* skipping sqrt hence returning rho squared */
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }
         else {
            ddx_ddy[1] = NULL; /* silence compiler warning */
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               rho_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(rho_bld, rho);
            }
         }
      }
   }

   return rho;
}


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when the texture is known
 * to have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mad(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}
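
/*
 * A worked example for the above (illustrative only), with
 * factor == BRILINEAR_FACTOR == 2, so pre_offset = 0.25 and post_offset = -1:
 *
 *    lod = 1.50: 1.50 + 0.25 = 1.75 -> ipart 1, fract 0.75 -> fpart  0.50
 *    lod = 1.00: 1.00 + 0.25 = 1.25 -> ipart 1, fract 0.25 -> fpart -0.50
 *    lod = 1.74: 1.74 + 0.25 = 1.99 -> ipart 1, fract 0.99 -> fpart  0.98
 *    lod = 1.76: 1.76 + 0.25 = 2.01 -> ipart 2, fract 0.01 -> fpart -0.98
 *
 * Midpoints (x.5) come out exact, lods near an integer snap to that level
 * (negative fpart means no mip interpolation), and the 1.74/1.76 rows show
 * the transition is continuous in effect: level 1 with weight ~1 is the
 * same as level 2 with weight ~0.
 */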


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's identical to calling lp_build_fast_log2() and lp_build_brilinear_lod()
 * above, but by combining the two we can compute the integer and fractional
 * parts independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mad(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
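
/*
 * Range check for the above (illustrative, with factor == 2, so
 * post_offset = -3 and pre_factor = 3.5/(2*M_SQRT2) ~= 1.2374): the
 * extracted mantissa lies in [1, 2), hence fpart = mantissa*2 - 3 stays
 * in [-1, 1), matching the no-clamp-needed claim above, while the
 * pre_factor shifts the points where the exponent increments so that
 * ipart lines up with the brilinear rounding.
 */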


/**
 * Fast implementation of iround(log2(sqrt(x))), based on
 * log2(x^n) == n*log2(x).
 *
 * Gives accurate results all the time.
 * (Could be trivially extended to handle other power-of-two roots.)
 */
static LLVMValueRef
lp_build_ilog2_sqrt(struct lp_build_context *bld,
                    LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   struct lp_type i_type = lp_int_type(bld->type);
   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* ipart = round(0.5*log2(x)) = floor(0.5*(log2(x) + 1.0)) */
   ipart = lp_build_extract_exponent(bld, x, 1);
   ipart = LLVMBuildAShr(builder, ipart, one, "");

   return ipart;
}


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 * \param cube_rho  rho calculated by cube coord mapping (optional)
 * \param out_lod_ipart  integer part of lod
 * \param out_lod_fpart  float part of lod (never larger than 1 but may be negative)
 * \param out_lod_positive  (mask) if lod is positive (i.e. texture is minified)
 *
 * The resulting lod can be scalar per quad or be per element.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart,
                      LLVMValueRef *out_lod_positive)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *lodf_bld = &bld->lodf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->lodi_bld.zero;
   *out_lod_positive = bld->lodi_bld.zero;
   *out_lod_fpart = lodf_bld->zero;

   /*
    * For determining min/mag, we follow the GL 4.1 spec, 3.9.12 Texture Magnification:
    * "Implementations may either unconditionally assume c = 0 for the minification
    * vs. magnification switch-over point, or may choose to make c depend on the
    * combination of minification and magnification modes as follows: if the
    * magnification filter is given by LINEAR and the minification filter is given
    * by NEAREST_MIPMAP_NEAREST or NEAREST_MIPMAP_LINEAR, then c = 0.5. This is
    * done to ensure that a minified texture does not appear "sharper" than a
    * magnified texture. Otherwise c = 0."
    * And 3.9.11 Texture Minification:
    * "If lod is less than or equal to the constant c (see section 3.9.12) the
    * texture is said to be magnified; if it is greater, the texture is minified."
    * So we always use 0 as the switchover point, and use magnification for lod == 0.
    * Note that the always c = 0 behavior is new (first appearing in the GL 3.1
    * spec); old GL versions required 0.5 for the modes listed above.
    * I have no clue about the (undocumented) wishes of d3d9/d3d10 here!
    */

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                bld->context_ptr, sampler_unit);

      lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            lodf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;
         boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
                                (bld->dims > 1)) || cube_rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * We don't actually need both values all the time; lod_ipart
                * is needed for the nearest mipfilter, lod_positive if
                * min != mag.
                */
               if (rho_squared) {
                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
               }
               else {
                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
               }
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
                !rho_squared) {
               /*
                * This can't work if rho is squared. Not sure if it could be
                * fixed while keeping it worthwhile; we could also do a sqrt
                * here, but brilinear plus no_rho_opt seems like a combination
                * that doesn't make much sense anyway, so just use the
                * ordinary path below.
                */
               lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(lodf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(lodf_bld, rho);
         }
         if (rho_squared) {
            /* lod holds log2(rho^2) == 2*log2(rho), so halve it */
            lod = lp_build_mul(lodf_bld, lod,
                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    lodf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            dynamic_state->lod_bias(dynamic_state, bld->gallivm,
                                    bld->context_ptr, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            dynamic_state->max_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod);

         lod = lp_build_min(lodf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod);

         lod = lp_build_max(lodf_bld, lod, min_lod);
      }
   }

   *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                    lod, lodf_bld->zero);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(lodf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert the int part of the lod
 * to the actual mip level.
 * Note: this is all scalar per quad code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns the integer mip level
 * \param out_of_bounds  returns per coord out_of_bounds mask if provided
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out,
                           LLVMValueRef *out_of_bounds)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   LLVMValueRef first_level, last_level, level;

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   if (out_of_bounds) {
      LLVMValueRef out, out1;
      out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(leveli_bld, out, out1);
      if (bld->num_mips == bld->coord_bld.type.length) {
         *out_of_bounds = out;
      }
      else if (bld->num_mips == 1) {
         *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
      }
      else {
         assert(bld->num_mips == bld->coord_bld.type.length / 4);
         *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                leveli_bld->type,
                                                                bld->int_coord_bld.type,
                                                                out);
      }
      level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
      *level_out = level;
   }
   else {
      /* clamp level to legal range of levels */
      *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
   }
}


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per element) int LOD(s)
 * to two (per-quad) (adjacent) mipmap level indexes, and fix up float lod
 * part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   assert(bld->num_lods == bld->num_mips);

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
    * the minimum number of comparisons, and zeroing lod_fpart in the extreme
    * ends in the process.
    */

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}
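
/*
 * Example of the clamping above (illustrative): with first_level = 0 and
 * last_level = 3,
 *    lod_ipart = -1 -> level0 = -1 -> clamp_min: level0 = level1 = 0, fpart = 0
 *    lod_ipart =  1 -> level0 = 1, level1 = 2, fpart untouched
 *    lod_ipart =  3 -> level0 = 3 -> clamp_max: level0 = level1 = 3, fpart = 0
 * Forcing fpart to 0 at the extremes makes the clamped lanes sample a single
 * level instead of blending with an out-of-range neighbor.
 */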


/**
 * Return pointer to a single mipmap level.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert(bld->num_mips == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * @param lod_scalar  if lod is a (broadcasted) scalar
 * Returns max(1, base_size >> level).
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level,
                boolean lod_scalar)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size;
      assert(bld->type.sign);
      if (lod_scalar ||
          (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
         size = LLVMBuildLShr(builder, base_size, level, "minify");
         size = lp_build_max(bld, size, bld->one);
      }
      else {
         /*
          * Emulate the shift with a float mul, since intel "forgot" shifts
          * with per-element shift counts until avx2; those otherwise result
          * in terrible scalar extraction (both count and value), scalar
          * shift, and vector reinsertion. Should not be an issue on any
          * non-x86 cpu with a vector instruction set.
          * On cpus with AMD's XOP this should also be unnecessary, but I'm
          * not sure if llvm would emit this with current flags.
          */
         LLVMValueRef const127, const23, lf;
         struct lp_type ftype;
         struct lp_build_context fbld;
         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
         lp_build_context_init(&fbld, bld->gallivm, ftype);
         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);

         /* calculate 2^(-level) float */
         lf = lp_build_sub(bld, const127, level);
         lf = lp_build_shl(bld, lf, const23);
         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");

         /* finish the shift operation by doing a float mul */
         base_size = lp_build_int_to_float(&fbld, base_size);
         size = lp_build_mul(&fbld, base_size, lf);
         /*
          * Do the max with floats as well, because
          * a) non-emulated int max requires sse41
          *    (this is actually a lie, as we could cast to 16bit values,
          *    since 16bit is sufficient and 16bit int max is sse2)
          * b) with avx we can do int max 4-wide but float max 8-wide
          */
         size = lp_build_max(&fbld, size, fbld.one);
         size = lp_build_itrunc(&fbld, size);
      }
      return size;
   }
}


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert(bld->num_mips == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


   1204 /**
   1205  * When sampling a mipmap, we need to compute the width, height, depth
   1206  * of the source levels from the level indexes.  This helper function
   1207  * does that.
   1208  */
   1209 void
   1210 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
   1211                             LLVMValueRef ilevel,
   1212                             LLVMValueRef *out_size,
   1213                             LLVMValueRef *row_stride_vec,
   1214                             LLVMValueRef *img_stride_vec)
   1215 {
   1216    const unsigned dims = bld->dims;
   1217    LLVMValueRef ilevel_vec;
   1218 
   1219    /*
   1220     * Compute width, height, depth at mipmap level 'ilevel'
   1221     */
   1222    if (bld->num_mips == 1) {
   1223       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
   1224       *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
   1225    }
   1226    else {
   1227       LLVMValueRef int_size_vec;
   1228       LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   1229       unsigned num_quads = bld->coord_bld.type.length / 4;
   1230       unsigned i;
   1231 
   1232       if (bld->num_mips == num_quads) {
   1233          /*
   1234           * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
   1235           * intel "forgot" the variable shift count instruction until avx2.
   1236           * A harmless 8x32 shift gets translated into 32 instructions
   1237           * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
   1238           * unable to recognize if there are really just 2 different shift
   1239           * count values. So do the shift 4-wide before expansion.
   1240           */
   1241          struct lp_build_context bld4;
   1242          struct lp_type type4;
   1243 
   1244          type4 = bld->int_coord_bld.type;
   1245          type4.length = 4;
   1246 
   1247          lp_build_context_init(&bld4, bld->gallivm, type4);
   1248 
   1249          if (bld->dims == 1) {
   1250             assert(bld->int_size_in_bld.type.length == 1);
   1251             int_size_vec = lp_build_broadcast_scalar(&bld4,
   1252                                                      bld->int_size);
   1253          }
   1254          else {
   1255             assert(bld->int_size_in_bld.type.length == 4);
   1256             int_size_vec = bld->int_size;
   1257          }
   1258 
   1259          for (i = 0; i < num_quads; i++) {
   1260             LLVMValueRef ileveli;
   1261             LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1262 
   1263             ileveli = lp_build_extract_broadcast(bld->gallivm,
   1264                                                  bld->leveli_bld.type,
   1265                                                  bld4.type,
   1266                                                  ilevel,
   1267                                                  indexi);
   1268             tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
   1269          }
   1270          /*
   1271           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
   1272           * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
   1273           */
   1274          *out_size = lp_build_concat(bld->gallivm,
   1275                                      tmp,
   1276                                      bld4.type,
   1277                                      num_quads);
   1278       }
   1279       else {
   1280         /* FIXME: this is terrible and results in _huge_ vector
   1281          * (for the dims > 1 case).
   1282          * Should refactor this (together with extract_image_sizes) and do
   1283          * something more useful. Could for instance if we have width,height
   1284          * with 4-wide vector pack all elements into a 8xi16 vector
   1285          * (on which we can still do useful math) instead of using a 16xi32
   1286          * vector.
   1287          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
   1288          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
   1289          */
   1290          assert(bld->num_mips == bld->coord_bld.type.length);
   1291          if (bld->dims == 1) {
   1292             assert(bld->int_size_in_bld.type.length == 1);
   1293             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
   1294                                                      bld->int_size);
   1295             *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
   1296          }
   1297          else {
   1298             LLVMValueRef ilevel1;
   1299             for (i = 0; i < bld->num_mips; i++) {
   1300                LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
   1301                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
   1302                                                     bld->int_size_in_bld.type, ilevel, indexi);
   1303                tmp[i] = bld->int_size;
   1304                tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
   1305             }
   1306             *out_size = lp_build_concat(bld->gallivm, tmp,
   1307                                         bld->int_size_in_bld.type,
   1308                                         bld->num_mips);
   1309          }
   1310       }
   1311    }
   1312 
   1313    if (dims >= 2) {
   1314       *row_stride_vec = lp_build_get_level_stride_vec(bld,
   1315                                                       bld->row_stride_array,
   1316                                                       ilevel);
   1317    }
   1318    if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
   1319       *img_stride_vec = lp_build_get_level_stride_vec(bld,
   1320                                                       bld->img_stride_array,
   1321                                                       ilevel);
   1322    }
   1323 }
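
        /*
         * Worked example (illustrative): lp_build_minify computes
         * max(size >> level, 1) per dimension, so a 64x16 2d texture at
         * ilevel 3 gives out_size [8, 2], and at ilevel 5 gives [2, 1]
         * (the height clamps at one texel).
         */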
   1324 
   1325 
   1326 /**
   1327  * Extract and broadcast texture size.
   1328  *
   1329  * @param size_bld    build context for the texture size vector (either
   1330  *                    bld->int_size_bld or bld->float_size_bld)
   1331  * @param coord_type  type of the coordinate vector (either
   1332  *                    bld->int_coord_type or bld->coord_type)
   1333  * @param size        vector with the texture size (width, height, depth)
   1334  */
   1335 void
   1336 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
   1337                              struct lp_build_context *size_bld,
   1338                              struct lp_type coord_type,
   1339                              LLVMValueRef size,
   1340                              LLVMValueRef *out_width,
   1341                              LLVMValueRef *out_height,
   1342                              LLVMValueRef *out_depth)
   1343 {
   1344    const unsigned dims = bld->dims;
   1345    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   1346    struct lp_type size_type = size_bld->type;
   1347 
   1348    if (bld->num_mips == 1) {
   1349       *out_width = lp_build_extract_broadcast(bld->gallivm,
   1350                                               size_type,
   1351                                               coord_type,
   1352                                               size,
   1353                                               LLVMConstInt(i32t, 0, 0));
   1354       if (dims >= 2) {
   1355          *out_height = lp_build_extract_broadcast(bld->gallivm,
   1356                                                   size_type,
   1357                                                   coord_type,
   1358                                                   size,
   1359                                                   LLVMConstInt(i32t, 1, 0));
   1360          if (dims == 3) {
   1361             *out_depth = lp_build_extract_broadcast(bld->gallivm,
   1362                                                     size_type,
   1363                                                     coord_type,
   1364                                                     size,
   1365                                                     LLVMConstInt(i32t, 2, 0));
   1366          }
   1367       }
   1368    }
   1369    else {
   1370       unsigned num_quads = bld->coord_bld.type.length / 4;
   1371 
   1372       if (dims == 1) {
   1373          *out_width = size;
   1374       }
   1375       else if (bld->num_mips == num_quads) {
   1376          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
   1377          if (dims >= 2) {
   1378             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
   1379             if (dims == 3) {
   1380                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
   1381             }
   1382          }
   1383       }
   1384       else {
   1385          assert(bld->num_mips == bld->coord_type.length);
   1386          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1387                                                 coord_type, size, 0);
   1388          if (dims >= 2) {
   1389             *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1390                                                     coord_type, size, 1);
   1391             if (dims == 3) {
   1392                *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
   1393                                                       coord_type, size, 2);
   1394             }
   1395          }
   1396       }
   1397    }
   1398 }
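
        /*
         * Illustration (8-wide coord vector, per-quad mips, dims == 2):
         * size arrives as [w0, h0, _, _, w1, h1, _, _] and the swizzles
         * above yield out_width  = [w0, w0, w0, w0, w1, w1, w1, w1] and
         * out_height = [h0, h0, h0, h0, h1, h1, h1, h1].
         */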
   1399 
   1400 
   1401 /**
   1402  * Unnormalize coords.
   1403  *
   1404  * @param flt_size  vector with the float texture size (width, height, depth)
   1405  */
   1406 void
   1407 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
   1408                              LLVMValueRef flt_size,
   1409                              LLVMValueRef *s,
   1410                              LLVMValueRef *t,
   1411                              LLVMValueRef *r)
   1412 {
   1413    const unsigned dims = bld->dims;
   1414    LLVMValueRef width;
   1415    LLVMValueRef height;
   1416    LLVMValueRef depth;
   1417 
   1418    lp_build_extract_image_sizes(bld,
   1419                                 &bld->float_size_bld,
   1420                                 bld->coord_type,
   1421                                 flt_size,
   1422                                 &width,
   1423                                 &height,
   1424                                 &depth);
   1425 
   1426    /* s = s * width, t = t * height */
   1427    *s = lp_build_mul(&bld->coord_bld, *s, width);
   1428    if (dims >= 2) {
   1429       *t = lp_build_mul(&bld->coord_bld, *t, height);
   1430       if (dims >= 3) {
   1431          *r = lp_build_mul(&bld->coord_bld, *r, depth);
   1432       }
   1433    }
   1434 }
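
        /*
         * E.g. s == 0.25 on a 256 texel wide level becomes 64.0, moving the
         * coords from [0,1] into texel space.
         */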
   1435 
   1436 /**
   1437  * Generate new coords and faces for cubemap texels falling off the face.
   1438  *
   1439  * @param face   face (center) of the pixel
   1440  * @param x0     lower x coord
   1441  * @param x1     higher x coord (must be x0 + 1)
   1442  * @param y0     lower y coord
   1443  * @param y1     higher y coord (must be y0 + 1)
   1444  * @param max_coord     texture cube (level) size - 1
   1445  * @param next_faces    new face values when falling off
   1446  * @param next_xcoords  new x coord values when falling off
   1447  * @param next_ycoords  new y coord values when falling off
   1448  *
   1449  * The arrays hold the new values when under/overflow of
   1450  * lower x, higher x, lower y, higher y coord would occur (in this order).
   1451  * next_xcoords/next_ycoords have two entries each (for both new lower and
   1452  * higher coord).
   1453  */
   1454 void
   1455 lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
   1456                         LLVMValueRef face,
   1457                         LLVMValueRef x0,
   1458                         LLVMValueRef x1,
   1459                         LLVMValueRef y0,
   1460                         LLVMValueRef y1,
   1461                         LLVMValueRef max_coord,
   1462                         LLVMValueRef next_faces[4],
   1463                         LLVMValueRef next_xcoords[4][2],
   1464                         LLVMValueRef next_ycoords[4][2])
   1465 {
   1466    /*
   1467    * Lookup tables aren't nice for SIMD code, hence try some logic here.
   1468    * (Note that while a LUT would not require per-sample (4) lookups,
   1469    * since texels can never fall off the positive and negative edges
   1470    * simultaneously, it would still require 2 lookups for corner
   1471    * handling, because in that case texels fall off both the x and
   1472    * y axes.)
   1473     */
   1474    /*
   1475     * Next faces (for face 012345):
   1476     * x < 0.0  : 451110
   1477     * x >= 1.0 : 540001
   1478     * y < 0.0  : 225422
   1479     * y >= 1.0 : 334533
   1480     * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
   1481    * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + (face & 1))
   1482    * nfy+: (face & ~4) > 1 ? face + 2 : 3
   1483    * This could also use pshufb instead, but would need a (manually coded)
   1484    * ssse3 intrinsic (llvm won't do non-constant shuffles).
   1485     */
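           /*
            * Quick check of the formulas above (illustrative): for face 0
            * (+x), nfx- = 4 + (0 & 1) = 4 and nfy+ = 3, matching the first
            * column of the table (x < 0 -> 4, y >= 1 -> 3); for face 5 (-z),
            * nfx- = 0 and nfx+ = 0 xor 1 = 1, matching the last columns of
            * 451110 and 540001.
            */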
   1486    struct gallivm_state *gallivm = ivec_bld->gallivm;
   1487    LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
   1488    LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
   1489    LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
   1490    LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
   1491    LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
   1492    LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
   1493 
   1494    sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
   1495    tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
   1496    sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
   1497    faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
   1498    tmp = lp_build_add(ivec_bld, faceand1, c4);
   1499    next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
   1500    next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);
   1501 
   1502    tmp = lp_build_andnot(ivec_bld, face, c4);
   1503    sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
   1504    tmp = lp_build_add(ivec_bld, face, c2);
   1505    next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
   1506    next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);
   1507 
   1508    /*
   1509     * new xcoords (for face 012345):
   1510     * x < 0.0  : max   max   t     max-t max  max
   1511     * x >= 1.0 : 0     0     max-t t     0    0
   1512     * y < 0.0  : max   0     max-s s     s    max-s
   1513     * y >= 1.0 : max   0     s     max-s s    max-s
   1514     *
   1515    * ncx[1] = (face & ~4) > 1 ? (face == 2 ? max-t : t) : 0
   1516    * ncx[0] = max - ncx[1]
   1517    * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1 ? 0 : max)
   1518    * ncx[2] = (face & ~4) > 1 ? max - ncx[3] : ncx[3]
   1519     */
   1520    sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
   1521    maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
   1522    tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
   1523    next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   1524    next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
   1525    maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
   1526    tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
   1527    next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   1528    next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);
   1529 
   1530    sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one);
   1531 
   1532    tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
   1533    maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
   1534    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   1535    next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   1536    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
   1537    next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
   1538    maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
   1539    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   1540    next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   1541    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
   1542    next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);
   1543 
   1544    /*
   1545     * new ycoords (for face 012345):
   1546     * x < 0.0  : t     t     0     max   t    t
   1547     * x >= 1.0 : t     t     0     max   t    t
   1548     * y < 0.0  : max-s s     0     max   max  0
   1549     * y >= 1.0 : s     max-s 0     max   0    max
   1550     *
   1551    * ncy[0] = (face & ~4) > 1 ? (face == 2 ? 0 : max) : t
   1552    * ncy[1] = ncy[0]
   1553    * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1 ? max-s : s)
   1554    * ncy[2] = (face & ~4) > 1 ? ncy[3] : max - ncy[3]
   1555     */
   1556    tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
   1557    next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
   1558    next_ycoords[1][0] = next_ycoords[0][0];
   1559    next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
   1560    next_ycoords[1][1] = next_ycoords[0][1];
   1561 
   1562    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   1563    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   1564    next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   1565    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
   1566    next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
   1567    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   1568    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   1569    next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   1570    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
   1571    next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
   1572 }
   1573 
   1574 
   1575 /** Helper used by lp_build_cube_lookup() */
   1576 static LLVMValueRef
   1577 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
   1578 {
   1579    /* ima = +0.5 / abs(coord); */
   1580    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   1581    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   1582    LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   1583    return ima;
   1584 }
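
        /*
         * The 0.5 factor folds the final scale-and-bias into one multiply:
         * the face coords are formed as 0.5 * sc / |ma| + 0.5, with the sign
         * of ma handled separately by the coord mirroring in
         * lp_build_cube_lookup().
         */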
   1585 
   1586 
   1587 /** Helper for doing 3-wise selection.
   1588  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
   1589  */
   1590 static LLVMValueRef
   1591 lp_build_select3(struct lp_build_context *sel_bld,
   1592                  LLVMValueRef sel0,
   1593                  LLVMValueRef sel1,
   1594                  LLVMValueRef val0,
   1595                  LLVMValueRef val1,
   1596                  LLVMValueRef val2)
   1597 {
   1598    LLVMValueRef tmp;
   1599    tmp = lp_build_select(sel_bld, sel0, val0, val1);
   1600    return lp_build_select(sel_bld, sel1, val2, tmp);
   1601 }
   1602 
   1603 
   1604 /**
   1605  * Generate code to do cube face selection and compute per-face texcoords.
   1606  */
   1607 void
   1608 lp_build_cube_lookup(struct lp_build_sample_context *bld,
   1609                      LLVMValueRef *coords,
   1610                      const struct lp_derivatives *derivs_in, /* optional */
   1611                      LLVMValueRef *rho,
   1612                      struct lp_derivatives *derivs_out, /* optional */
   1613                      boolean need_derivs)
   1614 {
   1615    struct lp_build_context *coord_bld = &bld->coord_bld;
   1616    LLVMBuilderRef builder = bld->gallivm->builder;
   1617    struct gallivm_state *gallivm = bld->gallivm;
   1618    LLVMValueRef si, ti, ri;
   1619 
   1620    /*
   1621    * Do per-pixel face selection. We cannot, however (as we used to do),
   1622    * simply calculate the derivs afterwards (which is quite bogus for
   1623    * explicit derivs anyway) because the values would be "random" when
   1624    * not all pixels lie on the same face. So what we do here is
   1625    * calculate the derivatives after scaling the coords by the absolute
   1626    * value of the inverse major axis, and essentially do the rho calculation
   1627    * steps as if it were a 3d texture. This is perfect if all pixels hit
   1628    * the same face, but not so great at edges; I believe the max error
   1629    * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
   1630    * the 3d distance between 2 points on the cube instead of measuring up/down
   1631    * the edge). Still, this is possibly a win over just selecting the same face
   1632    * for all pixels. Unfortunately, something like that doesn't work for
   1633    * explicit derivatives.
   1634     */
   1635    struct lp_build_context *cint_bld = &bld->int_coord_bld;
   1636    struct lp_type intctype = cint_bld->type;
   1637    LLVMTypeRef coord_vec_type = coord_bld->vec_type;
   1638    LLVMTypeRef cint_vec_type = cint_bld->vec_type;
   1639    LLVMValueRef as, at, ar, face, face_s, face_t;
   1640    LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
   1641    LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
   1642    LLVMValueRef tnegi, rnegi;
   1643    LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
   1644    LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
   1645    LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
   1646                                                   1LL << (intctype.width - 1));
   1647    LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
   1648                                                    intctype.width -1);
   1649    LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
   1650    LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
   1651    LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
   1652    LLVMValueRef s = coords[0];
   1653    LLVMValueRef t = coords[1];
   1654    LLVMValueRef r = coords[2];
   1655 
   1656    assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
   1657    assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
   1658    assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
   1659 
   1660    /*
   1661     * get absolute value (for x/y/z face selection) and sign bit
   1662     * (for mirroring minor coords and pos/neg face selection)
   1663     * of the original coords.
   1664     */
   1665    as = lp_build_abs(&bld->coord_bld, s);
   1666    at = lp_build_abs(&bld->coord_bld, t);
   1667    ar = lp_build_abs(&bld->coord_bld, r);
   1668 
   1669    /*
   1670     * major face determination: select x if x > y else select y
   1671     * select z if z >= max(x,y) else select previous result
   1672    * if some axes are equal we choose z over y and y over x - the
   1673    * dx10 spec seems to ask for this while OpenGL doesn't care (if we
   1674    * didn't care we could save a select or two by using different
   1675    * compares and doing at_g_as_ar last, since tnewx and tnewz are the
   1676    * same).
   1677     */
   1678    as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
   1679    maxasat = lp_build_max(coord_bld, as, at);
   1680    ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
   1681 
   1682    if (need_derivs && (derivs_in ||
   1683        ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
   1684         (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
   1685       /*
   1686       * XXX: This is really, really complex.
   1687       * It is a bit overkill to use this for implicit derivatives as well;
   1688       * there is no way this is worth the cost in practice, but it seems to
   1689       * be the only way to get accurate per-pixel lod values.
   1690        */
   1691       LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
   1692       LLVMValueRef madx, mady, madxdivma, madydivma;
   1693       LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
   1694       LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
   1695       LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
   1696       LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
   1697       LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
   1698       /*
   1699        * s = 1/2 * ( sc / ma + 1)
   1700        * t = 1/2 * ( tc / ma + 1)
   1701        *
   1702        * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
   1703        * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
   1704        *
   1705        * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
   1706        * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
   1707        * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
   1708        * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
   1709        */
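              /*
               * In the code below ima is 1/ma, imahalf is 0.5/ma, and
               * madxdivma/madydivma are dx.ma / ma and dy.ma / ma; each
               * mul/sub pair evaluates the bracketed term of the formulas
               * above.
               */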
   1710 
   1711       /* select ma, calculate ima */
   1712       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1713       mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
   1714       signmabit = LLVMBuildAnd(builder, mai, signmask, "");
   1715       ima = lp_build_div(coord_bld, coord_bld->one, ma);
   1716       imahalf = lp_build_mul(coord_bld, posHalf, ima);
   1717       imahalfpos = lp_build_abs(coord_bld, imahalf);
   1718 
   1719       if (!derivs_in) {
   1720          ddx[0] = lp_build_ddx(coord_bld, s);
   1721          ddx[1] = lp_build_ddx(coord_bld, t);
   1722          ddx[2] = lp_build_ddx(coord_bld, r);
   1723          ddy[0] = lp_build_ddy(coord_bld, s);
   1724          ddy[1] = lp_build_ddy(coord_bld, t);
   1725          ddy[2] = lp_build_ddy(coord_bld, r);
   1726       }
   1727       else {
   1728          ddx[0] = derivs_in->ddx[0];
   1729          ddx[1] = derivs_in->ddx[1];
   1730          ddx[2] = derivs_in->ddx[2];
   1731          ddy[0] = derivs_in->ddy[0];
   1732          ddy[1] = derivs_in->ddy[1];
   1733          ddy[2] = derivs_in->ddy[2];
   1734       }
   1735 
   1736       /* select major derivatives */
   1737       madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
   1738       mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
   1739 
   1740       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
   1741       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
   1742       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
   1743 
   1744       sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
   1745       tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
   1746       rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
   1747 
   1748       sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
   1749       tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
   1750       rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
   1751 
   1752       /*
   1753        * compute all possible new s/t coords, which does the mirroring,
   1754        * and do the same for derivs minor axes.
   1755        * snewx = signma * -r;
   1756        * tnewx = -t;
   1757        * snewy = s;
   1758        * tnewy = signma * r;
   1759        * snewz = signma * s;
   1760        * tnewz = -t;
   1761        */
   1762       tnegi = LLVMBuildXor(builder, ti, signmask, "");
   1763       rnegi = LLVMBuildXor(builder, ri, signmask, "");
   1764       tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
   1765       rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
   1766       tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
   1767       rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
   1768 
   1769       snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
   1770       tnewx = tnegi;
   1771       sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
   1772       tdxnewx = tdxnegi;
   1773       sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
   1774       tdynewx = tdynegi;
   1775 
   1776       snewy = si;
   1777       tnewy = LLVMBuildXor(builder, signmabit, ri, "");
   1778       sdxnewy = sdxi;
   1779       tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
   1780       sdynewy = sdyi;
   1781       tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
   1782 
   1783       snewz = LLVMBuildXor(builder, signmabit, si, "");
   1784       tnewz = tnegi;
   1785       sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
   1786       tdxnewz = tdxnegi;
   1787       sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
   1788       tdynewz = tdynegi;
   1789 
   1790       /* select the mirrored values */
   1791       face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
   1792       face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
   1793       face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
   1794       face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
   1795       face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
   1796       face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
   1797       face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
   1798 
   1799       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
   1800       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
   1801       face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
   1802       face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
   1803       face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
   1804       face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
   1805 
   1806       /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
   1807       madxdivma = lp_build_mul(coord_bld, madx, ima);
   1808       tmp = lp_build_mul(coord_bld, madxdivma, face_s);
   1809       tmp = lp_build_sub(coord_bld, face_sdx, tmp);
   1810       derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
   1811 
   1812       /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
   1813       tmp = lp_build_mul(coord_bld, madxdivma, face_t);
   1814       tmp = lp_build_sub(coord_bld, face_tdx, tmp);
   1815       derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
   1816 
   1817       /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
   1818       madydivma = lp_build_mul(coord_bld, mady, ima);
   1819       tmp = lp_build_mul(coord_bld, madydivma, face_s);
   1820       tmp = lp_build_sub(coord_bld, face_sdy, tmp);
   1821       derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
   1822 
   1823       /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
   1824       tmp = lp_build_mul(coord_bld, madydivma, face_t);
   1825       tmp = lp_build_sub(coord_bld, face_tdy, tmp);
   1826       derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
   1827 
   1828       signma = LLVMBuildLShr(builder, mai, signshift, "");
   1829       coords[2] = LLVMBuildOr(builder, face, signma, "face");
   1830 
   1831       /* project coords */
   1832       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
   1833       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
   1834 
   1835       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
   1836       coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   1837 
   1838       return;
   1839    }
   1840 
   1841    else if (need_derivs) {
   1842       LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
   1843       static const unsigned char swizzle0[] = { /* no-op swizzle */
   1844          0, LP_BLD_SWIZZLE_DONTCARE,
   1845          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1846       };
   1847       static const unsigned char swizzle1[] = {
   1848          1, LP_BLD_SWIZZLE_DONTCARE,
   1849          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1850       };
   1851       static const unsigned char swizzle01[] = { /* no-op swizzle */
   1852          0, 1,
   1853          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1854       };
   1855       static const unsigned char swizzle23[] = {
   1856          2, 3,
   1857          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1858       };
   1859       static const unsigned char swizzle02[] = {
   1860          0, 2,
   1861          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
   1862       };
   1863 
   1864       /*
   1865        * scale the s/t/r coords pre-select/mirror so we can calculate
   1866        * "reasonable" derivs.
   1867        */
   1868       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1869       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
   1870       s = lp_build_mul(coord_bld, s, imahalfpos);
   1871       t = lp_build_mul(coord_bld, t, imahalfpos);
   1872       r = lp_build_mul(coord_bld, r, imahalfpos);
   1873 
   1874       /*
   1875       * This isn't quite the same as the "ordinary" (3d deriv) path since we
   1876       * know the texture is square, which simplifies things (we can omit the
   1877       * size mul, which ordinarily happens very early, entirely here and do
   1878       * it at the very end).
   1879        * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX
   1880        * since the error can get quite big otherwise at edges.
   1881       * (With no_rho_approx the max error is sqrt(2) at edges, same as it is
   1882       * without no_rho_approx for 2d textures; otherwise it would be a factor of 2.)
   1883        */
   1884       ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
   1885       ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
   1886 
   1887       ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
   1888       ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
   1889 
   1890       tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
   1891       tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
   1892       tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
   1893 
   1894       rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
   1895       rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
   1896 
   1897       tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
   1898       tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
   1899       *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
   1900    }
   1901 
   1902    if (!need_derivs) {
   1903       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
   1904    }
   1905    mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
   1906    signmabit = LLVMBuildAnd(builder, mai, signmask, "");
   1907 
   1908    si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
   1909    ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
   1910    ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
   1911 
   1912    /*
   1913     * compute all possible new s/t coords, which does the mirroring
   1914     * snewx = signma * -r;
   1915     * tnewx = -t;
   1916     * snewy = s;
   1917     * tnewy = signma * r;
   1918     * snewz = signma * s;
   1919     * tnewz = -t;
   1920     */
   1921    tnegi = LLVMBuildXor(builder, ti, signmask, "");
   1922    rnegi = LLVMBuildXor(builder, ri, signmask, "");
   1923 
   1924    snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
   1925    tnewx = tnegi;
   1926 
   1927    snewy = si;
   1928    tnewy = LLVMBuildXor(builder, signmabit, ri, "");
   1929 
   1930    snewz = LLVMBuildXor(builder, signmabit, si, "");
   1931    tnewz = tnegi;
   1932 
   1933    /* select the mirrored values */
   1934    face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
   1935    face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
   1936    face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
   1937 
   1938    face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
   1939    face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
   1940 
   1941    /* add +1 for neg face */
   1942    /* XXX with AVX probably want to use another select here -
   1943     * as long as we ensure vblendvps gets used we can actually
   1944     * skip the comparison and just use sign as a "mask" directly.
   1945     */
   1946    signma = LLVMBuildLShr(builder, mai, signshift, "");
   1947    coords[2] = LLVMBuildOr(builder, face, signma, "face");
   1948 
   1949    /* project coords */
   1950    if (!need_derivs) {
   1951       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
   1952       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
   1953       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
   1954    }
   1955 
   1956    coords[0] = lp_build_add(coord_bld, face_s, posHalf);
   1957    coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   1958 }
   1959 
   1960 
   1961 /**
   1962  * Compute the partial offset of a pixel block along an arbitrary axis.
   1963  *
   1964  * @param coord   coordinate in pixels
   1965  * @param stride  number of bytes between rows of successive pixel blocks
   1966  * @param block_length  number of pixels in a pixel block along the coordinate
   1967  *                      axis
   1968  * @param out_offset    resulting relative offset of the pixel block in bytes
   1969  * @param out_subcoord  resulting sub-block pixel coordinate
   1970  */
   1971 void
   1972 lp_build_sample_partial_offset(struct lp_build_context *bld,
   1973                                unsigned block_length,
   1974                                LLVMValueRef coord,
   1975                                LLVMValueRef stride,
   1976                                LLVMValueRef *out_offset,
   1977                                LLVMValueRef *out_subcoord)
   1978 {
   1979    LLVMBuilderRef builder = bld->gallivm->builder;
   1980    LLVMValueRef offset;
   1981    LLVMValueRef subcoord;
   1982 
   1983    if (block_length == 1) {
   1984       subcoord = bld->zero;
   1985    }
   1986    else {
   1987       /*
   1988        * Pixel blocks have power of two dimensions, so LLVM should convert
   1989        * the rem/div to bit arithmetic.
   1990        * TODO: Verify this.
   1991        * It does indeed, BUT it transforms the operation to scalar (and back)
   1992        * when doing so (roughly extract, shift/and, mov, unpack) (llvm 2.7).
   1993        * The generated code looks seriously ugly and is quite expensive.
   1994        */
   1995 #if 0
   1996       LLVMValueRef block_width = lp_build_const_int_vec(bld->gallivm, bld->type, block_length);
   1997       subcoord = LLVMBuildURem(builder, coord, block_width, "");
   1998       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
   1999 #else
   2000       unsigned logbase2 = util_logbase2(block_length);
   2001       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
   2002       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
   2003       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
   2004       coord = LLVMBuildLShr(builder, coord, block_shift, "");
   2005 #endif
   2006    }
   2007 
   2008    offset = lp_build_mul(bld, coord, stride);
   2009 
   2010    assert(out_offset);
   2011    assert(out_subcoord);
   2012 
   2013    *out_offset = offset;
   2014    *out_subcoord = subcoord;
   2015 }
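
        /*
         * Worked example (illustrative): for block_length == 4 and coord == 13
         * the bit arithmetic gives subcoord = 13 & 3 = 1 and a block coord of
         * 13 >> 2 = 3, hence offset = 3 * stride.
         */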
   2016 
   2017 
   2018 /**
   2019  * Compute the offset of a pixel block.
   2020  *
   2021  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
   2022  *
   2023  * Returns the relative offset and i,j sub-block coordinates
   2024  */
   2025 void
   2026 lp_build_sample_offset(struct lp_build_context *bld,
   2027                        const struct util_format_description *format_desc,
   2028                        LLVMValueRef x,
   2029                        LLVMValueRef y,
   2030                        LLVMValueRef z,
   2031                        LLVMValueRef y_stride,
   2032                        LLVMValueRef z_stride,
   2033                        LLVMValueRef *out_offset,
   2034                        LLVMValueRef *out_i,
   2035                        LLVMValueRef *out_j)
   2036 {
   2037    LLVMValueRef x_stride;
   2038    LLVMValueRef offset;
   2039 
   2040    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
   2041                                  format_desc->block.bits/8);
   2042 
   2043    lp_build_sample_partial_offset(bld,
   2044                                   format_desc->block.width,
   2045                                   x, x_stride,
   2046                                   &offset, out_i);
   2047 
   2048    if (y && y_stride) {
   2049       LLVMValueRef y_offset;
   2050       lp_build_sample_partial_offset(bld,
   2051                                      format_desc->block.height,
   2052                                      y, y_stride,
   2053                                      &y_offset, out_j);
   2054       offset = lp_build_add(bld, offset, y_offset);
   2055    }
   2056    else {
   2057       *out_j = bld->zero;
   2058    }
   2059 
   2060    if (z && z_stride) {
   2061       LLVMValueRef z_offset;
   2062       LLVMValueRef k;
   2063       lp_build_sample_partial_offset(bld,
   2064                                      1, /* pixel blocks are always 2D */
   2065                                      z, z_stride,
   2066                                      &z_offset, &k);
   2067       offset = lp_build_add(bld, offset, z_offset);
   2068    }
   2069 
   2070    *out_offset = offset;
   2071 }
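
        /*
         * Illustration (assuming a DXT1-like format with 4x4 blocks and
         * 64 bits per block): for x == 5, y == 6 this yields i = 1, j = 2 and
         * offset = (5 >> 2) * 8 + (6 >> 2) * y_stride bytes.
         */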
   2072